241 files changed, 12237 insertions, 4224 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 8857f86890c..b44e91751ad 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -1,3 +1,4 @@
+remove_extra_strict_flags()
 
 set(INC
 	.
@@ -11,9 +12,20 @@ set(INC_SYS
 )
 
 set(SRC
-	kernel.cpp
-	kernel.cl
-	kernel.cu
+	kernels/cpu/kernel.cpp
+	kernels/opencl/kernel.cl
+	kernels/opencl/kernel_data_init.cl
+	kernels/opencl/kernel_queue_enqueue.cl
+	kernels/opencl/kernel_scene_intersect.cl
+	kernels/opencl/kernel_lamp_emission.cl
+	kernels/opencl/kernel_background_buffer_update.cl
+	kernels/opencl/kernel_shader_eval.cl
+	kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+	kernels/opencl/kernel_direct_lighting.cl
+	kernels/opencl/kernel_shadow_blocked.cl
+	kernels/opencl/kernel_next_iteration_setup.cl
+	kernels/opencl/kernel_sum_all_radiance.cl
+	kernels/cuda/kernel.cu
 )
 
 set(SRC_HEADERS
@@ -24,6 +36,7 @@ set(SRC_HEADERS
 	kernel_compat_cpu.h
 	kernel_compat_cuda.h
 	kernel_compat_opencl.h
+	kernel_debug.h
 	kernel_differential.h
 	kernel_emission.h
 	kernel_film.h
@@ -34,17 +47,22 @@ set(SRC_HEADERS
 	kernel_montecarlo.h
 	kernel_passes.h
 	kernel_path.h
+	kernel_path_branched.h
+	kernel_path_common.h
 	kernel_path_state.h
 	kernel_path_surface.h
 	kernel_path_volume.h
 	kernel_projection.h
+	kernel_queues.h
 	kernel_random.h
 	kernel_shader.h
+	kernel_shaderdata_vars.h
 	kernel_shadow.h
 	kernel_subsurface.h
 	kernel_textures.h
 	kernel_types.h
 	kernel_volume.h
+	kernel_work_stealing.h
 )
 
 set(SRC_CLOSURE_HEADERS
@@ -61,12 +79,12 @@ set(SRC_CLOSURE_HEADERS
 	closure/bsdf_transparent.h
 	closure/bsdf_util.h
 	closure/bsdf_ashikhmin_shirley.h
-	closure/bsdf_westin.h
 	closure/bsdf_hair.h
 	closure/bssrdf.h
 	closure/emissive.h
 	closure/volume.h
 )
+
 set(SRC_SVM_HEADERS
 	svm/svm.h
 	svm/svm_attribute.h
@@ -91,6 +109,7 @@ set(SRC_SVM_HEADERS
 	svm/svm_magic.h
 	svm/svm_mapping.h
 	svm/svm_math.h
+	svm/svm_math_util.h
 	svm/svm_mix.h
 	svm/svm_musgrave.h
 	svm/svm_noise.h
@@ -106,6 +125,7 @@ set(SRC_SVM_HEADERS
 	svm/svm_value.h
 	svm/svm_vector_transform.h
 	svm/svm_voronoi.h
+	svm/svm_voxel.h
 	svm/svm_wave.h
 )
 
@@ -116,22 +136,48 @@ set(SRC_GEOM_HEADERS
 	geom/geom_bvh_shadow.h
 	geom/geom_bvh_subsurface.h
 	geom/geom_bvh_traversal.h
+	geom/geom_bvh_volume.h
+	geom/geom_bvh_volume_all.h
 	geom/geom_curve.h
 	geom/geom_motion_curve.h
 	geom/geom_motion_triangle.h
 	geom/geom_object.h
 	geom/geom_primitive.h
+	geom/geom_qbvh.h
+	geom/geom_qbvh_shadow.h
+	geom/geom_qbvh_subsurface.h
+	geom/geom_qbvh_traversal.h
+	geom/geom_qbvh_volume.h
+	geom/geom_qbvh_volume_all.h
 	geom/geom_triangle.h
+	geom/geom_triangle_intersect.h
 	geom/geom_volume.h
 )
 
 set(SRC_UTIL_HEADERS
+	../util/util_atomic.h
 	../util/util_color.h
 	../util/util_half.h
 	../util/util_math.h
+	../util/util_math_fast.h
 	../util/util_transform.h
 	../util/util_types.h
 )
+
+set(SRC_SPLIT_HEADERS
+	split/kernel_background_buffer_update.h
+	split/kernel_data_init.h
+	split/kernel_direct_lighting.h
+	split/kernel_holdout_emission_blurring_pathtermination_ao.h
+	split/kernel_lamp_emission.h
+	split/kernel_next_iteration_setup.h
+	split/kernel_scene_intersect.h
+	split/kernel_shader_eval.h
+	split/kernel_shadow_blocked.h
+	split/kernel_split_common.h
+	split/kernel_sum_all_radiance.h
+)
+
 # CUDA module
 
 if(WITH_CYCLES_CUDA_BINARIES)
@@ -143,7 +189,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	endif()
 
 	# CUDA version
-	execute_process (COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT)
+	execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT)
 	string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR ${NVCC_OUT})
 	string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR ${NVCC_OUT})
 	set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")
@@ -157,18 +203,24 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	endif()
 
 	# build for each arch
-	set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
+	set(cuda_sources kernels/cuda/kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
 	set(cuda_cubins)
 
 	macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
 		if(${experimental})
-			set(cuda_extra_flags "-D__KERNEL_CUDA_EXPERIMENTAL__")
+			set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__")
 			set(cuda_cubin kernel_experimental_${arch}.cubin)
 		else()
 			set(cuda_extra_flags "")
 			set(cuda_cubin kernel_${arch}.cubin)
 		endif()
 
+		if(WITH_CYCLES_DEBUG)
+			set(cuda_debug_flags "-D__KERNEL_DEBUG__")
+		else()
+			set(cuda_debug_flags "")
+		endif()
+
 		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}")
 		set(cuda_math_flags "--use_fast_math")
 
@@ -177,13 +229,14 @@ if(WITH_CYCLES_CUDA_BINARIES)
 			COMMAND ${CUDA_NVCC_EXECUTABLE}
 					-arch=${arch}
 					-m${CUDA_BITS}
-					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu
+					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu
 					-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
 					--ptxas-options="-v"
 					${cuda_arch_flags}
 					${cuda_version_flags}
 					${cuda_math_flags}
 					${cuda_extra_flags}
+					${cuda_debug_flags}
 					-I${CMAKE_CURRENT_SOURCE_DIR}/../util
 					-I${CMAKE_CURRENT_SOURCE_DIR}/svm
 					-DCCL_NAMESPACE_BEGIN=
@@ -196,6 +249,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
 		list(APPEND cuda_cubins ${cuda_cubin})
 
 		unset(cuda_extra_flags)
+		unset(cuda_debug_flags)
 	endmacro()
 
 	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
@@ -223,20 +277,29 @@ include_directories(SYSTEM ${INC_SYS})
 
 if(CXX_HAS_SSE)
 	list(APPEND SRC
-		kernel_sse2.cpp
-		kernel_sse3.cpp
-		kernel_sse41.cpp
-		kernel_avx.cpp
-		kernel_avx2.cpp
+		kernels/cpu/kernel_sse2.cpp
+		kernels/cpu/kernel_sse3.cpp
+		kernels/cpu/kernel_sse41.cpp
 	)
 
-	set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
-	set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
-	set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
-	set_source_files_properties(kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
-	set_source_files_properties(kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
 endif()
 
+if(CXX_HAS_AVX)
+	list(APPEND SRC
+		kernels/cpu/kernel_avx.cpp
+	)
+	set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+endif()
+
+if(CXX_HAS_AVX2)
+	list(APPEND SRC
+		kernels/cpu/kernel_avx2.cpp
+	)
+	set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+endif()
 
 add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS})
 
@@ -254,11 +317,23 @@ endif()
 #add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED})
 #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
 
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cl" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cu" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_background_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/split)
 
diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript
index 5a9e57c5342..e8d51013924 100644
--- a/intern/cycles/kernel/SConscript
+++ b/intern/cycles/kernel/SConscript
@@ -57,8 +57,9 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     build_dir = os.path.join(root_build_dir, 'intern/cycles/kernel')
 
     # source directories and files
+    kernel_file_rel = os.path.join("kernels", "cuda", "kernel.cu")
     source_dir = Dir('.').srcnode().path
-    kernel_file = os.path.join(source_dir, "kernel.cu")
+    kernel_file = os.path.join(source_dir, kernel_file_rel)
     util_dir = os.path.join(source_dir, "../util")
     svm_dir = os.path.join(source_dir, "../svm")
     geom_dir = os.path.join(source_dir, "../geom")
@@ -79,12 +80,15 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     nvcc_flags += " -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC"
     nvcc_flags += " -I \"%s\" -I \"%s\" -I \"%s\" -I \"%s\"" % (util_dir, svm_dir, geom_dir, closure_dir)
 
+    if env['WITH_BF_CYCLES_DEBUG']:
+        nvcc_flags += " -D__KERNEL_DEBUG__"
+
     # dependencies
-    dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h')
+    dependencies = [kernel_file_rel] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h')
     last_cubin_file = None
 
     configs = (("kernel_%s.cubin", ''),
-               ("kernel_experimental_%s.cubin", ' -D__KERNEL_CUDA_EXPERIMENTAL__'))
+               ("kernel_experimental_%s.cubin", ' -D__KERNEL_EXPERIMENTAL__'))
 
     # add command for each cuda architecture
     for arch in cuda_archs:
@@ -102,7 +106,7 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
             else:
                 command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, current_flags, kernel_file, cubin_file)
 
-            kernel.Command(cubin_file, 'kernel.cu', command)
+            kernel.Command(cubin_file, kernel_file_rel, command)
             kernel.Depends(cubin_file, dependencies)
 
             kernel_binaries.append(cubin_file)
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 9961071c2ac..558aa0dc6a9 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "../closure/bsdf_ashikhmin_velvet.h"
@@ -24,7 +24,6 @@
 #include "../closure/bsdf_refraction.h"
 #include "../closure/bsdf_transparent.h"
 #include "../closure/bsdf_ashikhmin_shirley.h"
-#include "../closure/bsdf_westin.h"
 #include "../closure/bsdf_toon.h"
 #include "../closure/bsdf_hair.h"
 #ifdef __SUBSURFACE__
@@ -48,87 +47,79 @@ ccl_device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const Shader
 	switch(sc->type) {
 		case CLOSURE_BSDF_DIFFUSE_ID:
 		case CLOSURE_BSDF_BSSRDF_ID:
-			label = bsdf_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #ifdef __SVM__
 		case CLOSURE_BSDF_OREN_NAYAR_ID:
-			label = bsdf_oren_nayar_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		/*case CLOSURE_BSDF_PHONG_RAMP_ID:
-			label = bsdf_phong_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-			label = bsdf_diffuse_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;*/
 		case CLOSURE_BSDF_TRANSLUCENT_ID:
-			label = bsdf_translucent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_REFLECTION_ID:
-			label = bsdf_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_REFRACTION_ID:
-			label = bsdf_refraction_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_refraction_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_TRANSPARENT_ID:
-			label = bsdf_transparent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_transparent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_GGX_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-			label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_microfacet_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-			label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_microfacet_beckmann_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-			label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_ashikhmin_shirley_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-			label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_ashikhmin_velvet_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-			label = bsdf_diffuse_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_diffuse_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_GLOSSY_TOON_ID:
-			label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
-				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
-			break;
-		case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID:
-			label = bsdf_westin_backscatter_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
-				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
-			break;
-		case CLOSURE_BSDF_WESTIN_SHEEN_ID:
-			label = bsdf_westin_sheen_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_glossy_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-			label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_hair_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-			label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_hair_transmission_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 #ifdef __VOLUME__
 		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-			label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+			label = volume_henyey_greenstein_sample(sc, ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 		default:
@@ -148,73 +139,67 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
 		return OSLShader::bsdf_eval(sd, sc, omega_in, *pdf);
 #endif
 
-	if(dot(sd->Ng, omega_in) >= 0.0f) {
+	if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) {
 		switch(sc->type) {
 			case CLOSURE_BSDF_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_ID:
-				eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #ifdef __SVM__
 			case CLOSURE_BSDF_OREN_NAYAR_ID:
-				eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			/*case CLOSURE_BSDF_PHONG_RAMP_ID:
-				eval = bsdf_phong_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-				eval = bsdf_diffuse_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;*/
 			case CLOSURE_BSDF_TRANSLUCENT_ID:
-				eval = bsdf_translucent_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFLECTION_ID:
-				eval = bsdf_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFRACTION_ID:
-				eval = bsdf_refraction_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_refraction_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSPARENT_ID:
-				eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_transparent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-				eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-				eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_beckmann_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-				eval = bsdf_diffuse_toon_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
-				eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf);
-				break;
-			case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID:
-				eval = bsdf_westin_backscatter_eval_reflect(sc, sd->I, omega_in, pdf);
-				break;
-			case CLOSURE_BSDF_WESTIN_SHEEN_ID:
-				eval = bsdf_westin_sheen_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_glossy_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-				eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-				eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_transmission_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 #ifdef __VOLUME__
 			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+				eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 			default:
@@ -226,63 +211,57 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
 		switch(sc->type) {
 			case CLOSURE_BSDF_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_ID:
-				eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #ifdef __SVM__
 			case CLOSURE_BSDF_OREN_NAYAR_ID:
-				eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_oren_nayar_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSLUCENT_ID:
-				eval = bsdf_translucent_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_translucent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFLECTION_ID:
-				eval = bsdf_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFRACTION_ID:
-				eval = bsdf_refraction_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_refraction_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSPARENT_ID:
-				eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_transparent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-				eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-				eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_beckmann_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-				eval = bsdf_diffuse_toon_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
-				eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf);
-				break;
-			case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID:
-				eval = bsdf_westin_backscatter_eval_transmit(sc, sd->I, omega_in, pdf);
-				break;
-			case CLOSURE_BSDF_WESTIN_SHEEN_ID:
-				eval = bsdf_westin_sheen_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_glossy_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-				eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-				eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_transmission_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 #ifdef __VOLUME__
 			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+				eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 			default:
@@ -296,6 +275,8 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
 
 ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
 {
+/* TODO: do we want to blur volume closures? */
+
 #ifdef __OSL__
 	if(kg->osl && sc->prim) {
 		OSLShader::bsdf_blur(sc, roughness);
@@ -303,33 +284,8 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
 	}
 #endif
 
-	switch(sc->type) {
-		case CLOSURE_BSDF_DIFFUSE_ID:
-		case CLOSURE_BSDF_BSSRDF_ID:
-			bsdf_diffuse_blur(sc, roughness);
-			break;
 #ifdef __SVM__
-		case CLOSURE_BSDF_OREN_NAYAR_ID:
-			bsdf_oren_nayar_blur(sc, roughness);
-			break;
-		/*case CLOSURE_BSDF_PHONG_RAMP_ID:
-			bsdf_phong_ramp_blur(sc, roughness);
-			break;
-		case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-			bsdf_diffuse_ramp_blur(sc, roughness);
-			break;*/
-		case CLOSURE_BSDF_TRANSLUCENT_ID:
-			bsdf_translucent_blur(sc, roughness);
-			break;
-		case CLOSURE_BSDF_REFLECTION_ID:
-			bsdf_reflection_blur(sc, roughness);
-			break;
-		case CLOSURE_BSDF_REFRACTION_ID:
-			bsdf_refraction_blur(sc, roughness);
-			break;
-		case CLOSURE_BSDF_TRANSPARENT_ID:
-			bsdf_transparent_blur(sc, roughness);
-			break;
+	switch(sc->type) {
 		case CLOSURE_BSDF_MICROFACET_GGX_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
@@ -344,30 +300,10 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
 			bsdf_ashikhmin_shirley_blur(sc, roughness);
 			break;
-		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-			bsdf_ashikhmin_velvet_blur(sc, roughness);
-			break;
-		case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-			bsdf_diffuse_toon_blur(sc, roughness);
-			break;
-		case CLOSURE_BSDF_GLOSSY_TOON_ID:
-			bsdf_glossy_toon_blur(sc, roughness);
-			break;
-		case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID:
-			bsdf_westin_backscatter_blur(sc, roughness);
-			break;
-		case CLOSURE_BSDF_WESTIN_SHEEN_ID:
-			bsdf_westin_sheen_blur(sc, roughness);
-			break;
-		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-			bsdf_hair_reflection_blur(sc, roughness);
-			break;
-#endif
-		/* todo: do we want to blur volume closures? */
 		default:
 			break;
 	}
+#endif
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
index ad7864cb8ea..8d7d533d6f8 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011-2013 Blender Foundation
+ * Copyright 2011-2014 Blender Foundation
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __BSDF_ASHIKHMIN_SHIRLEY_H__
@@ -33,24 +33,20 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device int bsdf_ashikhmin_shirley_setup(ShaderClosure *sc)
 {
-	/* store roughness. could already convert to exponent to save some cycles
-	 * in eval, but this is more consistent with other bsdfs and shader_blur. */
 	sc->data0 = clamp(sc->data0, 1e-4f, 1.0f);
 	sc->data1 = sc->data0;
 
 	sc->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID;
-	return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device int bsdf_ashikhmin_shirley_aniso_setup(ShaderClosure *sc)
 {
-	/* store roughness. could already convert to exponent to save some cycles
-	 * in eval, but this is more consistent with other bsdfs and shader_blur. */
 	sc->data0 = clamp(sc->data0, 1e-4f, 1.0f);
 	sc->data1 = clamp(sc->data1, 1e-4f, 1.0f);
 
 	sc->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID;
-	return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device void bsdf_ashikhmin_shirley_blur(ShaderClosure *sc, float roughness)
@@ -73,7 +69,10 @@ ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, c
 
 	float out = 0.0f;
 
-	if (NdotI > 0.0f && NdotO > 0.0f) {
+	if(fmaxf(sc->data0, sc->data1) <= 1e-4f)
+		return make_float3(0.0f, 0.0f, 0.0f);
+
+	if(NdotI > 0.0f && NdotO > 0.0f) {
 		NdotI = fmaxf(NdotI, 1e-6f);
 		NdotO = fmaxf(NdotO, 1e-6f);
 		float3 H = normalize(omega_in + I);
@@ -86,7 +85,8 @@ ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, c
 		float n_x = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data0);
 		float n_y = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data1);
 
-		if (n_x == n_y) {  /* => isotropic case */
+		if(n_x == n_y) {
+			/* isotropic */
 			float e = n_x;
 			float lobe = powf(HdotN, e);
 			float norm = (n_x + 1.0f) / (8.0f * M_PI_F);
@@ -94,7 +94,8 @@ ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, c
 			out = NdotO * norm * lobe * pump;
 			*pdf = norm * lobe / HdotI; /* this is p_h / 4(H.I)  (conversion from 'wh measure' to 'wi measure', eq. 8 in paper) */
 		}
-		else {             /* => ANisotropic case */
+		else {
+			/* anisotropic */
 			float3 X, Y;
 			make_orthonormals_tangent(N, sc->T, &X, &Y);
 
@@ -130,7 +131,7 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 	float3 N = sc->N;
 
 	float NdotI = dot(N, I);
-	if (NdotI > 0.0f) {
+	if(NdotI > 0.0f) {
 
 		float n_x = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data0);
 		float n_y = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data1);
@@ -146,21 +147,23 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 		/* sample spherical coords for h in tangent space */
 		float phi;
 		float cos_theta;
-		if (n_x == n_y) {  /* => simple isotropic sampling */
+		if(n_x == n_y) {
+			/* isotropic sampling */
 			phi = M_2PI_F * randu;
 			cos_theta = powf(randv, 1.0f / (n_x + 1.0f));
 		}
-		else {             /* => more complex anisotropic sampling */
-			if (randu < 0.25f) {      /* first quadrant */
+		else {
+			/* anisotropic sampling */
+			if(randu < 0.25f) {      /* first quadrant */
 				float remapped_randu = 4.0f * randu;
 				bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta);
 			}
-			else if (randu < 0.5f) {  /* second quadrant */
+			else if(randu < 0.5f) {  /* second quadrant */
 				float remapped_randu = 4.0f * (.5f - randu);
 				bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta);
 				phi = M_PI_F - phi;
 			}
-			else if (randu < 0.75f) { /* third quadrant */
+			else if(randu < 0.75f) { /* third quadrant */
 				float remapped_randu = 4.0f * (randu - 0.5f);
 				bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta);
 				phi = M_PI_F + phi;
@@ -185,14 +188,20 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 		/* half vector to world space */
 		float3 H = h.x*X + h.y*Y + h.z*N;
 		float HdotI = dot(H, I);
-		if (HdotI < 0.0f) H = -H;
+		if(HdotI < 0.0f) H = -H;
 
 		/* reflect I on H to get omega_in */
 		*omega_in = -I + (2.0f * HdotI) * H;
 
-		/* leave the rest to eval_reflect */
-		/* (could maybe optimize a few things by manual inlining, but I doubt it would make much difference) */
-		*eval = bsdf_ashikhmin_shirley_eval_reflect(sc, I, *omega_in, pdf);
+		if(fmaxf(sc->data0, sc->data1) <= 1e-4f) {
+			/* Some high number for MIS. */
+			*pdf = 1e6f;
+			*eval = make_float3(1e6f, 1e6f, 1e6f);
+		}
+		else {
+			/* leave the rest to eval_reflect */
+			*eval = bsdf_ashikhmin_shirley_eval_reflect(sc, I, *omega_in, pdf);
+		}
 
 #ifdef __RAY_DIFFERENTIALS__
 		/* just do the reflection thing for now */
@@ -201,7 +210,7 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 #endif
 	}
 
-	return LABEL_REFLECT | LABEL_GLOSSY;
+	return LABEL_REFLECT|LABEL_GLOSSY;
 }
 
 
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index 3631f90bf8c..f1a26650078 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -45,10 +45,6 @@ ccl_device int bsdf_ashikhmin_velvet_setup(ShaderClosure *sc)
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device void bsdf_ashikhmin_velvet_blur(ShaderClosure *sc, float roughness)
-{
-}
-
 ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	float m_invsigma2 = sc->data0;
@@ -63,7 +59,7 @@ ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc, co
 		float cosHO = fabsf(dot(I, H));
 
 		if(!(fabsf(cosNH) < 1.0f-1e-5f && cosHO > 1e-5f))
-			return make_float3(0, 0, 0);
+			return make_float3(0.0f, 0.0f, 0.0f);
 
 		float cosNHdivHO = cosNH / cosHO;
 		cosNHdivHO = fmaxf(cosNHdivHO, 1e-5f);
@@ -84,7 +80,7 @@ ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc, co
 		return make_float3(out, out, out);
 	}
 
-	return make_float3(0, 0, 0);
+	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
 ccl_device float3 bsdf_ashikhmin_velvet_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
@@ -118,7 +114,7 @@ ccl_device int bsdf_ashikhmin_velvet_sample(const ShaderClosure *sc, float3 Ng,
 
 			float sinNH2 = 1 - cosNH * cosNH;
 			float sinNH4 = sinNH2 * sinNH2;
-			float cotangent2 =  (cosNH * cosNH) / sinNH2;
+			float cotangent2 = (cosNH * cosNH) / sinNH2;
 
 			float D = expf(-cotangent2 * m_invsigma2) * m_invsigma2 * M_1_PI_F / sinNH4;
 			float G = min(1.0f, min(fac1, fac2)); // TODO: derive G from D analytically
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index 949fe869549..4b29bb096d1 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -43,10 +43,6 @@ ccl_device int bsdf_diffuse_setup(ShaderClosure *sc)
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device void bsdf_diffuse_blur(ShaderClosure *sc, float roughness)
-{
-}
-
 ccl_device float3 bsdf_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	float3 N = sc->N;
@@ -90,10 +86,6 @@ ccl_device int bsdf_translucent_setup(ShaderClosure *sc)
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device void bsdf_translucent_blur(ShaderClosure *sc, float roughness)
-{
-}
-
 ccl_device float3 bsdf_translucent_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
@@ -108,11 +100,6 @@ ccl_device float3 bsdf_translucent_eval_transmit(const ShaderClosure *sc, const
 	return make_float3 (cos_pi, cos_pi, cos_pi);
 }
 
-ccl_device float bsdf_translucent_albedo(const ShaderClosure *sc, const float3 I)
-{
-	return 1.0f;
-}
-
 ccl_device int bsdf_translucent_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 	float3 N = sc->N;
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index b856774375f..e0287e7655a 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -41,9 +41,9 @@ ccl_device float3 bsdf_diffuse_ramp_get_color(const ShaderClosure *sc, const flo
 	
 	float npos = pos * (float)(MAXCOLORS - 1);
 	int ipos = float_to_int(npos);
-	if (ipos < 0)
+	if(ipos < 0)
 		return colors[0];
-	if (ipos >= (MAXCOLORS - 1))
+	if(ipos >= (MAXCOLORS - 1))
 		return colors[MAXCOLORS - 1];
 	float offset = npos - (float)ipos;
 	return colors[ipos] * (1.0f - offset) + colors[ipos+1] * offset;
@@ -52,7 +52,9 @@ ccl_device float3 bsdf_diffuse_ramp_get_color(const ShaderClosure *sc, const flo
 ccl_device int bsdf_diffuse_ramp_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_DIFFUSE_RAMP_ID;
-	return SD_BSDF | SD_BSDF_HAS_EVAL;
+	sc->data0 = 0.0f;
+	sc->data1 = 0.0f;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device void bsdf_diffuse_ramp_blur(ShaderClosure *sc, float roughness)
diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h
index e0b5454592b..1e81617a7d3 100644
--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -36,20 +36,12 @@
 CCL_NAMESPACE_BEGIN
 
 
-ccl_device void bsdf_hair_reflection_blur(ShaderClosure *sc, float roughness)
-{
-}
-
-ccl_device void bsdf_hair_transmission_blur(ShaderClosure *sc, float roughness)
-{
-}
-
 ccl_device int bsdf_hair_reflection_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_HAIR_REFLECTION_ID;
 	sc->data0 = clamp(sc->data0, 0.001f, 1.0f);
 	sc->data1 = clamp(sc->data1, 0.001f, 1.0f);
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device int bsdf_hair_transmission_setup(ShaderClosure *sc)
@@ -57,31 +49,25 @@ ccl_device int bsdf_hair_transmission_setup(ShaderClosure *sc)
 	sc->type = CLOSURE_BSDF_HAIR_TRANSMISSION_ID;
 	sc->data0 = clamp(sc->data0, 0.001f, 1.0f);
 	sc->data1 = clamp(sc->data1, 0.001f, 1.0f);
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-#ifdef __HAIR__
 	float offset = sc->data2;
 	float3 Tg = sc->T;
-#else
-	float offset = 0.0f;
-	float3 Tg = make_float3(1.0f, 0.0f, 0.0f);
-#endif
 	float roughness1 = sc->data0;
 	float roughness2 = sc->data1;
 
 	float Iz = dot(Tg, I);
 	float3 locy = normalize(I - Tg * Iz);
-	//float3 locx = cross(locy, Tg);
 
-	float theta_r = M_PI_2_F - safe_acosf(Iz);
+	float theta_r = M_PI_2_F - fast_acosf(Iz);
 
 	float omega_in_z = dot(Tg, omega_in);
 	float3 omega_in_y = normalize(omega_in - Tg * omega_in_z);
 
-	float theta_i = M_PI_2_F - safe_acosf(omega_in_z);
+	float theta_i = M_PI_2_F - fast_acosf(omega_in_z);
 	float cosphi_i = dot(omega_in_y, locy);
 
 	if(M_PI_2_F - fabsf(theta_i) < 0.001f || cosphi_i < 0.0f) {
@@ -89,17 +75,19 @@ ccl_device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc, con
 		return make_float3(*pdf, *pdf, *pdf);
 	}
 
-	float phi_i = safe_acosf(cosphi_i) / roughness2;
+	float roughness1_inv = 1.0f / roughness1;
+	float roughness2_inv = 1.0f / roughness2;
+	float phi_i = fast_acosf(cosphi_i) * roughness2_inv;
 	phi_i = fabsf(phi_i) < M_PI_F ? phi_i : M_PI_F;
-	float costheta_i = cosf(theta_i);
+	float costheta_i = fast_cosf(theta_i);
 
-	float a_R = atan2f(((M_PI_2_F + theta_r) * 0.5f - offset) / roughness1, 1.0f);
-	float b_R = atan2f(((-M_PI_2_F + theta_r) * 0.5f - offset) / roughness1, 1.0f);
+	float a_R = fast_atan2f(((M_PI_2_F + theta_r) * 0.5f - offset) * roughness1_inv, 1.0f);
+	float b_R = fast_atan2f(((-M_PI_2_F + theta_r) * 0.5f - offset) * roughness1_inv, 1.0f);
 
 	float theta_h = (theta_i + theta_r) * 0.5f;
 	float t = theta_h - offset;
 
-	float phi_pdf = cosf(phi_i * 0.5f) * 0.25f / roughness2;
+	float phi_pdf = fast_cosf(phi_i * 0.5f) * 0.25f * roughness2_inv;
 	float theta_pdf = roughness1 / (2 * (t*t + roughness1*roughness1) * (a_R - b_R)* costheta_i);
 	*pdf = phi_pdf * theta_pdf;
 
@@ -119,37 +107,32 @@ ccl_device float3 bsdf_hair_reflection_eval_transmit(const ShaderClosure *sc, co
 
 ccl_device float3 bsdf_hair_transmission_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-#ifdef __HAIR__
 	float offset = sc->data2;
 	float3 Tg = sc->T;
-#else
-	float offset = 0.0f;
-	float3 Tg = make_float3(1.0f, 0.0f, 0.0f);
-#endif
 	float roughness1 = sc->data0;
 	float roughness2 = sc->data1;
 	float Iz = dot(Tg, I);
 	float3 locy = normalize(I - Tg * Iz);
-	//float3 locx = cross(locy, Tg);
 
-	float theta_r = M_PI_2_F - safe_acosf(Iz);
+	float theta_r = M_PI_2_F - fast_acosf(Iz);
 
 	float omega_in_z = dot(Tg, omega_in);
 	float3 omega_in_y = normalize(omega_in - Tg * omega_in_z);
 
-	float theta_i = M_PI_2_F - safe_acosf(omega_in_z);
-	float phi_i = safe_acosf(dot(omega_in_y, locy));
+	float theta_i = M_PI_2_F - fast_acosf(omega_in_z);
+	float phi_i = fast_acosf(dot(omega_in_y, locy));
 
 	if(M_PI_2_F - fabsf(theta_i) < 0.001f) {
 		*pdf = 0.0f;
 		return make_float3(*pdf, *pdf, *pdf);
 	}
 
-	float costheta_i = cosf(theta_i);
+	float costheta_i = fast_cosf(theta_i);
 
-	float a_TT = atan2f(((M_PI_2_F + theta_r)/2 - offset) / roughness1, 1.0f);
-	float b_TT = atan2f(((-M_PI_2_F + theta_r)/2 - offset) / roughness1, 1.0f);
-	float c_TT = 2 * atan2f(M_PI_2_F / roughness2, 1.0f);
+	float roughness1_inv = 1.0f / roughness1;
+	float a_TT = fast_atan2f(((M_PI_2_F + theta_r)/2 - offset) * roughness1_inv, 1.0f);
+	float b_TT = fast_atan2f(((-M_PI_2_F + theta_r)/2 - offset) * roughness1_inv, 1.0f);
+	float c_TT = 2 * fast_atan2f(M_PI_2_F / roughness2, 1.0f);
 
 	float theta_h = (theta_i + theta_r) / 2;
 	float t = theta_h - offset;
@@ -165,39 +148,38 @@ ccl_device float3 bsdf_hair_transmission_eval_transmit(const ShaderClosure *sc,
 
 ccl_device int bsdf_hair_reflection_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-#ifdef __HAIR__
 	float offset = sc->data2;
 	float3 Tg = sc->T;
-#else
-	float offset = 0.0f;
-	float3 Tg = make_float3(1.0f, 0.0f, 0.0f);
-#endif
 	float roughness1 = sc->data0;
 	float roughness2 = sc->data1;
 	float Iz = dot(Tg, I);
 	float3 locy = normalize(I - Tg * Iz);
 	float3 locx = cross(locy, Tg);
-	float theta_r = M_PI_2_F - safe_acosf(Iz);
+	float theta_r = M_PI_2_F - fast_acosf(Iz);
 
-	float a_R = atan2f(((M_PI_2_F + theta_r) * 0.5f - offset) / roughness1, 1.0f);
-	float b_R = atan2f(((-M_PI_2_F + theta_r) * 0.5f - offset) / roughness1, 1.0f);
+	float roughness1_inv = 1.0f / roughness1;
+	float a_R = fast_atan2f(((M_PI_2_F + theta_r) * 0.5f - offset) * roughness1_inv, 1.0f);
+	float b_R = fast_atan2f(((-M_PI_2_F + theta_r) * 0.5f - offset) * roughness1_inv, 1.0f);
 
 	float t = roughness1 * tanf(randu * (a_R - b_R) + b_R);
 
 	float theta_h = t + offset;
 	float theta_i = 2 * theta_h - theta_r;
-	float costheta_i = cosf(theta_i);
-	float sintheta_i = sinf(theta_i);
+
+	float costheta_i, sintheta_i;
+	fast_sincosf(theta_i, &sintheta_i, &costheta_i);
 
 	float phi = 2 * safe_asinf(1 - 2 * randv) * roughness2;
 
-	float phi_pdf = cosf(phi * 0.5f) * 0.25f / roughness2;
+	float phi_pdf = fast_cosf(phi * 0.5f) * 0.25f / roughness2;
 
 	float theta_pdf = roughness1 / (2 * (t*t + roughness1*roughness1) * (a_R - b_R)*costheta_i);
 
-	*omega_in =(cosf(phi) * costheta_i) * locy -
-			   (sinf(phi) * costheta_i) * locx +
-			   (            sintheta_i) * Tg;
+	float sinphi, cosphi;
+	fast_sincosf(phi, &sinphi, &cosphi);
+	*omega_in =(cosphi * costheta_i) * locy -
+	           (sinphi * costheta_i) * locx +
+	           (         sintheta_i) * Tg;
 
 	//differentials - TODO: find a better approximation for the reflective bounce
 #ifdef __RAY_DIFFERENTIALS__
@@ -211,48 +193,43 @@ ccl_device int bsdf_hair_reflection_sample(const ShaderClosure *sc, float3 Ng, f
 
 	*eval = make_float3(*pdf, *pdf, *pdf);
 
-	if(dot(locy, *omega_in) < 0.0f) {
-		return LABEL_REFLECT|LABEL_TRANSMIT|LABEL_GLOSSY;
-	}
-	
 	return LABEL_REFLECT|LABEL_GLOSSY;
 }
 
 ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-#ifdef __HAIR__
 	float offset = sc->data2;
 	float3 Tg = sc->T;
-#else
-	float offset = 0.0f;
-	float3 Tg = make_float3(1.0f, 0.0f, 0.0f);
-#endif
 	float roughness1 = sc->data0;
 	float roughness2 = sc->data1;
 	float Iz = dot(Tg, I);
 	float3 locy = normalize(I - Tg * Iz);
 	float3 locx = cross(locy, Tg);
-	float theta_r = M_PI_2_F - safe_acosf(Iz);
+	float theta_r = M_PI_2_F - fast_acosf(Iz);
 
-	float a_TT = atan2f(((M_PI_2_F + theta_r)/2 - offset) / roughness1, 1.0f);
-	float b_TT = atan2f(((-M_PI_2_F + theta_r)/2 - offset) / roughness1, 1.0f);
-	float c_TT = 2 * atan2f(M_PI_2_F / roughness2, 1.0f);
+	float roughness1_inv = 1.0f / roughness1;
+	float a_TT = fast_atan2f(((M_PI_2_F + theta_r)/2 - offset) * roughness1_inv, 1.0f);
+	float b_TT = fast_atan2f(((-M_PI_2_F + theta_r)/2 - offset) * roughness1_inv, 1.0f);
+	float c_TT = 2 * fast_atan2f(M_PI_2_F / roughness2, 1.0f);
 
 	float t = roughness1 * tanf(randu * (a_TT - b_TT) + b_TT);
 
 	float theta_h = t + offset;
 	float theta_i = 2 * theta_h - theta_r;
-	float costheta_i = cosf(theta_i);
-	float sintheta_i = sinf(theta_i);
+
+	float costheta_i, sintheta_i;
+	fast_sincosf(theta_i, &sintheta_i, &costheta_i);
 
 	float p = roughness2 * tanf(c_TT * (randv - 0.5f));
 	float phi = p + M_PI_F;
 	float theta_pdf = roughness1 / (2 * (t*t + roughness1*roughness1) * (a_TT - b_TT) * costheta_i);
 	float phi_pdf = roughness2 / (c_TT * (p * p + roughness2 * roughness2));
 
-	*omega_in =(cosf(phi) * costheta_i) * locy -
-	           (sinf(phi) * costheta_i) * locx +
-	           (            sintheta_i) * Tg;
+	float sinphi, cosphi;
+	fast_sincosf(phi, &sinphi, &cosphi);
+	*omega_in =(cosphi * costheta_i) * locy -
+	           (sinphi * costheta_i) * locx +
+	           (         sintheta_i) * Tg;
 
 	//differentials - TODO: find a better approximation for the transmission bounce
 #ifdef __RAY_DIFFERENTIALS__
@@ -267,10 +244,9 @@ ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng,
 
 	*eval = make_float3(*pdf, *pdf, *pdf);
 
-	if(dot(locy, *omega_in) < 0.0f)
-		return LABEL_TRANSMIT|LABEL_GLOSSY;
-	
-	return LABEL_GLOSSY;
+	kernel_assert(dot(locy, *omega_in) < 0.0f);
+
+	return LABEL_TRANSMIT|LABEL_GLOSSY;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index a0c59e6cbc0..2a0e8f62e7c 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -35,145 +35,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Approximate erf and erfinv implementations
- *
- * Adapted from code (C) Copyright John Maddock 2006.
- * Use, modification and distribution are subject to the
- * Boost Software License, Version 1.0. (See accompanying file
- * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) */
-
-ccl_device float approx_erff_impl(float z)
-{
-	float result;
-
-	if(z < 0.5f) {
-		if(z < 1e-10f) {
-			if(z == 0) {
-				result = 0;
-			}
-			else {
-				float c = 0.0033791670f;
-				result = z * 1.125f + z * c;
-			}
-		}
-		else {
-			float Y = 1.044948577f;
-
-			float zz = z * z;
-			float num = (((-0.007727583f * zz) + -0.050999073f)*zz + -0.338165134f)*zz + 0.083430589f;
-			float denom = (((0.000370900f * zz) + 0.008585719f)*zz + 0.087522260f)*zz + 0.455004033f;
-			result = z * (Y + num / denom);
-		}
-	}
-	else if(z < 2.5f) {
-		if(z < 1.5f) {
-			float Y = 0.4059357643f;
-			float fz = z - 0.5f;
-
-			float num = (((0.088890036f * fz) + 0.191003695f)*fz + 0.178114665f)*fz + -0.098090592f;
-			float denom = (((0.123850974f * fz) + 0.578052804f)*fz + 1.426280048f)*fz + 1.847590709f;
-
-			result = Y + num / denom;
-			result *= expf(-z * z) / z;
-		}
-		else  {
-			float Y = 0.506728172f;
-			float fz = z - 1.5f;
-			float num = (((0.017567943f * fz) + 0.043948189f)*fz + 0.038654037f)*fz + -0.024350047f;
-			float denom = (((0.325732924f * fz) + 0.982403709f)*fz + 1.539914949f)*fz + 1;
-
-			result = Y + num / denom;
-			result *= expf(-z * z) / z;
-		}
-
-		result = 1 - result;
-	}
-	else {
-		result = 1;
-	}
-
-	return result;
-}
-
-ccl_device float approx_erff(float z)
-{
-	float s = 1.0f;
-
-	if(z < 0.0f) {
-		s = -1.0f;
-		z = -z;
-	}
-
-	return s * approx_erff_impl(z);
-}
-
-ccl_device float approx_erfinvf_impl(float p, float q)
-{
-	float result = 0;
-
-	if(p <= 0.5f) {
-		float Y = 0.089131474f;
-		float g = p * (p + 10);
-		float num = (((-0.012692614f * p) + 0.033480662f)*p + -0.008368748f)*p + -0.000508781f;
-		float denom = (((1.562215583f * p) + -1.565745582f)*p + -0.970005043f)*p + 1.0f;
-		float r = num / denom;
-		result = g * Y + g * r;
-	}
-	else if(q >= 0.25f) {
-		float Y = 2.249481201f;
-		float g = sqrtf(-2 * logf(q));
-		float xs = q - 0.25f;
-		float num = (((17.644729840f * xs) + 8.370503283f)*xs + 0.105264680f)*xs + -0.202433508f;
-		float denom = (((-28.660818049f * xs) + 3.971343795f)*xs + 6.242641248f)*xs + 1.0f;
-		float r = num / denom;
-		result = g / (Y + r);
-	}
-	else {
-		float x = sqrtf(-logf(q));
-
-		if(x < 3) {
-			float Y = 0.807220458f;
-			float xs = x - 1.125f;
-			float num = (((0.387079738f * xs) + 0.117030156f)*xs + -0.163794047f)*xs + -0.131102781f;
-			float denom = (((4.778465929f * xs) + 5.381683457f)*xs + 3.466254072f)*xs + 1.0f;
-			float R = num / denom;
-			result = Y * x + R * x;
-		}
-		else {
-			float Y = 0.939955711f;
-			float xs = x - 3;
-			float num = (((0.009508047f * xs) + 0.018557330f)*xs + -0.002224265f)*xs + -0.035035378f;
-			float denom = (((0.220091105f * xs) + 0.762059164f)*xs + 1.365334981f)*xs + 1.0f;
-			float R = num / denom;
-			result = Y * x + R * x;
-		}
-	}
-
-	return result;
-}
-
-ccl_device float approx_erfinvf(float z)
-{
-	float p, q, s;
-
-	if(z < 0) {
-	  p = -z;
-	  q = 1 - p;
-	  s = -1;
-	}
-	else {
-	  p = z;
-	  q = 1 - z;
-	  s = 1;
-	}
-
-	return s * approx_erfinvf_impl(p, q);
-}
-
-/* Beckmann and GGX microfacet importance sampling from:
- * 
- * Importance Sampling Microfacet-Based BSDFs using the Distribution of Visible Normals.
- * E. Heitz and E. d'Eon, EGSR 2014 */
+/* Beckmann and GGX microfacet importance sampling. */
 
 ccl_device_inline void microfacet_beckmann_sample_slopes(
 	KernelGlobals *kg,
@@ -194,64 +56,71 @@ ccl_device_inline void microfacet_beckmann_sample_slopes(
 	/* precomputations */
 	const float tan_theta_i = sin_theta_i/cos_theta_i;
 	const float inv_a = tan_theta_i;
-	const float a = 1.0f/inv_a;
-	const float erf_a = approx_erff(a);
-	const float exp_a2 = expf(-a*a);
+	const float cot_theta_i = 1.0f/tan_theta_i;
+	const float erf_a = fast_erff(cot_theta_i);
+	const float exp_a2 = expf(-cot_theta_i*cot_theta_i);
 	const float SQRT_PI_INV = 0.56418958354f;
 	const float Lambda = 0.5f*(erf_a - 1.0f) + (0.5f*SQRT_PI_INV)*(exp_a2*inv_a);
 	const float G1 = 1.0f/(1.0f + Lambda); /* masking */
 
 	*G1i = G1;
 
-#if 0
-	const float C = 1.0f - G1 * erf_a;
-
-	/* sample slope X */
-	if(randu < C) {
-		/* rescale randu */
-		randu = randu / C;
-		const float w_1 = 0.5f * SQRT_PI_INV * sin_theta_i * exp_a2;
-		const float w_2 = cos_theta_i * (0.5f - 0.5f*erf_a);
-		const float p = w_1 / (w_1 + w_2);
-
-		if(randu < p) {
-			randu = randu / p;
-			*slope_x = -sqrtf(-logf(randu*exp_a2));
-		}
-		else {
-			randu = (randu - p) / (1.0f - p);
-			*slope_x = approx_erfinvf(randu - 1.0f - randu*erf_a);
-		}
+#if defined(__KERNEL_GPU__)
+	/* Based on paper from Wenzel Jakob
+	 * An Improved Visible Normal Sampling Routine for the Beckmann Distribution
+	 *
+	 * http://www.mitsuba-renderer.org/~wenzel/files/visnormal.pdf
+	 *
+	 * Reformulation from OpenShadingLanguage which avoids using inverse
+	 * trigonometric functions.
+	 */
+
+	/* Sample slope X.
+	 *
+	 * Compute a coarse initial estimate using the approximation:
+	 *   exp(-ierf(x)^2) ~= 1 - x * x
+	 *   solve y = 1 + b + K * (1 - b * b)
+	 */
+	float K = tan_theta_i * SQRT_PI_INV;
+	float y_approx = randu * (1.0f + erf_a + K * (1 - erf_a * erf_a));
+	float y_exact  = randu * (1.0f + erf_a + K * exp_a2);
+	float b = K > 0 ? (0.5f - sqrtf(K * (K - y_approx + 1.0f) + 0.25f)) / K : y_approx - 1.0f;
+
+	/* Perform newton step to refine toward the true root. */
+	float inv_erf = fast_ierff(b);
+	float value  = 1.0f + b + K * expf(-inv_erf * inv_erf) - y_exact;
+	/* Check if we are close enough already,
+	 * this also avoids NaNs as we get close to the root.
+	 */
+	if(fabsf(value) > 1e-6f) {
+		b -= value / (1.0f - inv_erf * tan_theta_i); /* newton step 1. */
+		inv_erf = fast_ierff(b);
+		value  = 1.0f + b + K * expf(-inv_erf * inv_erf) - y_exact;
+		b -= value / (1.0f - inv_erf * tan_theta_i); /* newton step 2. */
+		/* Compute the slope from the refined value. */
+		*slope_x = fast_ierff(b);
 	}
 	else {
-		/* rescale randu */
-		randu = (randu - C) / (1.0f - C);
-		*slope_x = approx_erfinvf((-1.0f + 2.0f*randu)*erf_a);
-
-		const float p = (-(*slope_x)*sin_theta_i + cos_theta_i) / (2.0f*cos_theta_i);
-
-		if(randv > p) {
-			*slope_x = -(*slope_x);
-			randv = (randv - p) / (1.0f - p);
-		}
-		else
-			randv = randv / p;
+		/* We are close enough already. */
+		*slope_x = inv_erf;
 	}
-
-	/* sample slope Y */
-	*slope_y = approx_erfinvf(2.0f*randv - 1.0f);
+	*slope_y = fast_ierff(2.0f*randv - 1.0f);
 #else
-	/* use precomputed table, because it better preserves stratification
-	 * of the random number pattern */
+	/* Use precomputed table on CPU, it gives better performance. */
 	int beckmann_table_offset = kernel_data.tables.beckmann_offset;
 
 	*slope_x = lookup_table_read_2D(kg, randu, cos_theta_i,
 		beckmann_table_offset, BECKMANN_TABLE_SIZE, BECKMANN_TABLE_SIZE);
-	*slope_y = approx_erfinvf(2.0f*randv - 1.0f);
+	*slope_y = fast_ierff(2.0f*randv - 1.0f);
 #endif
-
 }
 
+/* GGX microfacet importance sampling from:
+ *
+ * Importance Sampling Microfacet-Based BSDFs using the Distribution of Visible Normals.
+ * E. Heitz and E. d'Eon, EGSR 2014
+ */
+
 ccl_device_inline void microfacet_ggx_sample_slopes(
 	const float cos_theta_i, const float sin_theta_i,
 	float randu, float randv, float *slope_x, float *slope_y,
@@ -366,32 +235,32 @@ ccl_device_inline float3 microfacet_sample_stretched(
 
 ccl_device int bsdf_microfacet_ggx_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
+	sc->data0 = saturate(sc->data0); /* alpha_x */
 	sc->data1 = sc->data0; /* alpha_y */
 	
 	sc->type = CLOSURE_BSDF_MICROFACET_GGX_ID;
 
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device int bsdf_microfacet_ggx_aniso_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
-	sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* alpha_y */
+	sc->data0 = saturate(sc->data0); /* alpha_x */
+	sc->data1 = saturate(sc->data1); /* alpha_y */
 	
 	sc->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID;
 
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device int bsdf_microfacet_ggx_refraction_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
+	sc->data0 = saturate(sc->data0); /* alpha_x */
 	sc->data1 = sc->data0; /* alpha_y */
 
 	sc->type = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
 
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device void bsdf_microfacet_ggx_blur(ShaderClosure *sc, float roughness)
@@ -404,11 +273,11 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
 {
 	float alpha_x = sc->data0;
 	float alpha_y = sc->data1;
-	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
+	bool m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
 	float3 N = sc->N;
 
 	if(m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f)
-		return make_float3(0, 0, 0);
+		return make_float3(0.0f, 0.0f, 0.0f);
 
 	float cosNO = dot(N, I);
 	float cosNI = dot(N, omega_in);
@@ -487,7 +356,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
 		return make_float3(out, out, out);
 	}
 
-	return make_float3(0, 0, 0);
+	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
 ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
@@ -495,17 +364,17 @@ ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, con
 	float alpha_x = sc->data0;
 	float alpha_y = sc->data1;
 	float m_eta = sc->data2;
-	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
+	bool m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
 	float3 N = sc->N;
 
 	if(!m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f)
-		return make_float3(0, 0, 0);
+		return make_float3(0.0f, 0.0f, 0.0f);
 
 	float cosNO = dot(N, I);
 	float cosNI = dot(N, omega_in);
 
 	if(cosNO <= 0 || cosNI >= 0)
-		return make_float3(0, 0, 0); /* vectors on same side -- not possible */
+		return make_float3(0.0f, 0.0f, 0.0f); /* vectors on same side -- not possible */
 
 	/* compute half-vector of the refraction (eq. 16) */
 	float3 ht = -(m_eta * omega_in + I);
@@ -513,10 +382,6 @@ ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, con
 	float cosHO = dot(Ht, I);
 	float cosHI = dot(Ht, omega_in);
 
-	/* those situations makes chi+ terms in eq. 33, 34 be zero */
-	if(dot(Ht, N) <= 0.0f || cosHO * cosNO <= 0.0f || cosHI * cosNI <= 0.0f)
-		return make_float3(0.0f, 0.0f, 0.0f);
-
 	float D, G1o, G1i;
 
 	/* eq. 33: first we calculate D(m) with m=Ht: */
@@ -543,7 +408,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, con
 	 * pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2 */
 	float common = D * (m_eta * m_eta) / (cosNO * Ht2);
 	float out = G * fabsf(cosHI * cosHO) * common;
-	*pdf = G1o * cosHO * fabsf(cosHI) * common;
+	*pdf = G1o * fabsf(cosHO * cosHI) * common;
 
 	return make_float3(out, out, out);
 }
@@ -552,7 +417,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 {
 	float alpha_x = sc->data0;
 	float alpha_y = sc->data1;
-	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
+	bool m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
 	float3 N = sc->N;
 
 	float cosNO = dot(N, I);
@@ -657,16 +522,16 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 #ifdef __RAY_DIFFERENTIALS__
 			float3 dRdx, dRdy, dTdx, dTdy;
 #endif
-			float m_eta = sc->data2;
+			float m_eta = sc->data2, fresnel;
 			bool inside;
 
-			fresnel_dielectric(m_eta, m, I, &R, &T,
+			fresnel = fresnel_dielectric(m_eta, m, I, &R, &T,
 #ifdef __RAY_DIFFERENTIALS__
 				dIdx, dIdy, &dRdx, &dRdy, &dTdx, &dTdy,
 #endif
 				&inside);
 			
-			if(!inside) {
+			if(!inside && fresnel != 1.0f) {
 
 				*omega_in = T;
 #ifdef __RAY_DIFFERENTIALS__
@@ -719,29 +584,29 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 
 ccl_device int bsdf_microfacet_beckmann_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
+	sc->data0 = saturate(sc->data0); /* alpha_x */
 	sc->data1 = sc->data0; /* alpha_y */
 
 	sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ID;
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device int bsdf_microfacet_beckmann_aniso_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
-	sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* alpha_y */
+	sc->data0 = saturate(sc->data0); /* alpha_x */
+	sc->data1 = saturate(sc->data1); /* alpha_y */
 
 	sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID;
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device int bsdf_microfacet_beckmann_refraction_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
+	sc->data0 = saturate(sc->data0); /* alpha_x */
 	sc->data1 = sc->data0; /* alpha_y */
 
 	sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device void bsdf_microfacet_beckmann_blur(ShaderClosure *sc, float roughness)
@@ -754,11 +619,11 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_reflect(const ShaderClosure *sc,
 {
 	float alpha_x = sc->data0;
 	float alpha_y = sc->data1;
-	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
+	bool m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
 	float3 N = sc->N;
 
 	if(m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f)
-		return make_float3(0, 0, 0);
+		return make_float3(0.0f, 0.0f, 0.0f);
 
 	float cosNO = dot(N, I);
 	float cosNI = dot(N, omega_in);
@@ -840,7 +705,7 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_reflect(const ShaderClosure *sc,
 		return make_float3(out, out, out);
 	}
 
-	return make_float3(0, 0, 0);
+	return make_float3(0.0f, 0.0f, 0.0f);
 }
 
 ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
@@ -848,17 +713,17 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc
 	float alpha_x = sc->data0;
 	float alpha_y = sc->data1;
 	float m_eta = sc->data2;
-	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
+	bool m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
 	float3 N = sc->N;
 
 	if(!m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f)
-		return make_float3(0, 0, 0);
+		return make_float3(0.0f, 0.0f, 0.0f);
 
 	float cosNO = dot(N, I);
 	float cosNI = dot(N, omega_in);
 
 	if(cosNO <= 0 || cosNI >= 0)
-		return make_float3(0, 0, 0);
+		return make_float3(0.0f, 0.0f, 0.0f);
 
 	/* compute half-vector of the refraction (eq. 16) */
 	float3 ht = -(m_eta * omega_in + I);
@@ -866,10 +731,6 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc
 	float cosHO = dot(Ht, I);
 	float cosHI = dot(Ht, omega_in);
 
-	/* those situations makes chi+ terms in eq. 25, 27 be zero */
-	if(dot(Ht, N) <= 0.0f || cosHO * cosNO <= 0.0f || cosHI * cosNI <= 0.0f)
-		return make_float3(0.0f, 0.0f, 0.0f);
-
 	/* eq. 25: first we calculate D(m) with m=Ht: */
 	float alpha2 = alpha_x * alpha_y;
 	float cosThetaM = min(dot(N, Ht), 1.0f);
@@ -895,7 +756,7 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc
 	 * pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2 */
 	float common = D * (m_eta * m_eta) / (cosNO * Ht2);
 	float out = G * fabsf(cosHI * cosHO) * common;
-	*pdf = G1o * cosHO * fabsf(cosHI) * common;
+	*pdf = G1o * fabsf(cosHO * cosHI) * common;
 
 	return make_float3(out, out, out);
 }
@@ -904,7 +765,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 {
 	float alpha_x = sc->data0;
 	float alpha_y = sc->data1;
-	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
+	bool m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
 	float3 N = sc->N;
 
 	float cosNO = dot(N, I);
@@ -1011,16 +872,16 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 #ifdef __RAY_DIFFERENTIALS__
 			float3 dRdx, dRdy, dTdx, dTdy;
 #endif
-			float m_eta = sc->data2;
+			float m_eta = sc->data2, fresnel;
 			bool inside;
 
-			fresnel_dielectric(m_eta, m, I, &R, &T,
+			fresnel = fresnel_dielectric(m_eta, m, I, &R, &T,
 #ifdef __RAY_DIFFERENTIALS__
 				dIdx, dIdy, &dRdx, &dRdy, &dTdx, &dTdy,
 #endif
 				&inside);
 
-			if(!inside) {
+			if(!inside && fresnel != 1.0f) {
 				*omega_in = T;
 
 #ifdef __RAY_DIFFERENTIALS__
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index 6f685d5eeea..61b7cb11b02 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __BSDF_OREN_NAYAR_H__
@@ -25,7 +25,7 @@ ccl_device float3 bsdf_oren_nayar_get_intensity(const ShaderClosure *sc, float3
 	float nv = max(dot(n, v), 0.0f);
 	float t = dot(l, v) - nl * nv;
 
-	if (t > 0.0f)
+	if(t > 0.0f)
 		t /= max(nl, nv) + FLT_MIN;
 	float is = nl * (sc->data0 + sc->data1 * t);
 	return make_float3(is, is, is);
@@ -37,23 +37,19 @@ ccl_device int bsdf_oren_nayar_setup(ShaderClosure *sc)
 
 	sc->type = CLOSURE_BSDF_OREN_NAYAR_ID;
 
-	sigma = clamp(sigma, 0.0f, 1.0f);
+	sigma = saturate(sigma);
 
 	float div = 1.0f / (M_PI_F + ((3.0f * M_PI_F - 4.0f) / 6.0f) * sigma);
 
 	sc->data0 = 1.0f * div;
 	sc->data1 = sigma * div;
 
-	return SD_BSDF | SD_BSDF_HAS_EVAL;
-}
-
-ccl_device void bsdf_oren_nayar_blur(ShaderClosure *sc, float roughness)
-{
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device float3 bsdf_oren_nayar_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	if (dot(sc->N, omega_in) > 0.0f) {
+	if(dot(sc->N, omega_in) > 0.0f) {
 		*pdf = 0.5f * M_1_PI_F;
 		return bsdf_oren_nayar_get_intensity(sc, sc->N, I, omega_in);
 	}
@@ -72,7 +68,7 @@ ccl_device int bsdf_oren_nayar_sample(const ShaderClosure *sc, float3 Ng, float3
 {
 	sample_uniform_hemisphere(sc->N, randu, randv, omega_in, pdf);
 
-	if (dot(Ng, *omega_in) > 0.0f) {
+	if(dot(Ng, *omega_in) > 0.0f) {
 		*eval = bsdf_oren_nayar_get_intensity(sc, sc->N, I, *omega_in);
 
 #ifdef __RAY_DIFFERENTIALS__
@@ -86,7 +82,7 @@ ccl_device int bsdf_oren_nayar_sample(const ShaderClosure *sc, float3 Ng, float3
 		*eval = make_float3(0.0f, 0.0f, 0.0f);
 	}
 
-	return LABEL_REFLECT | LABEL_DIFFUSE;
+	return LABEL_REFLECT|LABEL_DIFFUSE;
 }
 
 
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index 2b4e1c68640..1ab15eee954 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -41,9 +41,9 @@ ccl_device float3 bsdf_phong_ramp_get_color(const ShaderClosure *sc, const float
 	
 	float npos = pos * (float)(MAXCOLORS - 1);
 	int ipos = float_to_int(npos);
-	if (ipos < 0)
+	if(ipos < 0)
 		return colors[0];
-	if (ipos >= (MAXCOLORS - 1))
+	if(ipos >= (MAXCOLORS - 1))
 		return colors[MAXCOLORS - 1];
 	float offset = npos - (float)ipos;
 	return colors[ipos] * (1.0f - offset) + colors[ipos+1] * offset;
@@ -51,10 +51,10 @@ ccl_device float3 bsdf_phong_ramp_get_color(const ShaderClosure *sc, const float
 
 ccl_device int bsdf_phong_ramp_setup(ShaderClosure *sc)
 {
-	sc->data0 = max(sc->data0, 0.0f);
-	
 	sc->type = CLOSURE_BSDF_PHONG_RAMP_ID;
-	return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_GLOSSY;
+	sc->data0 = max(sc->data0, 0.0f);
+	sc->data1 = 0.0f;
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
 ccl_device void bsdf_phong_ramp_blur(ShaderClosure *sc, float roughness)
@@ -67,11 +67,11 @@ ccl_device float3 bsdf_phong_ramp_eval_reflect(const ShaderClosure *sc, const fl
 	float cosNI = dot(sc->N, omega_in);
 	float cosNO = dot(sc->N, I);
 	
-	if (cosNI > 0 && cosNO > 0) {
+	if(cosNI > 0 && cosNO > 0) {
 		// reflect the view vector
 		float3 R = (2 * cosNO) * sc->N - I;
 		float cosRI = dot(R, omega_in);
-		if (cosRI > 0) {
+		if(cosRI > 0) {
 			float cosp = powf(cosRI, m_exponent);
 			float common = 0.5f * M_1_PI_F * cosp;
 			float out = cosNI * (m_exponent + 2) * common;
@@ -93,7 +93,7 @@ ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc, const float3 colo
 	float cosNO = dot(sc->N, I);
 	float m_exponent = sc->data0;
 	
-	if (cosNO > 0) {
+	if(cosNO > 0) {
 		// reflect the view vector
 		float3 R = (2 * cosNO) * sc->N - I;
 
@@ -111,12 +111,12 @@ ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc, const float3 colo
 		*omega_in = (cosf(phi) * sinTheta) * T +
 		            (sinf(phi) * sinTheta) * B +
 		            (            cosTheta) * R;
-		if (dot(Ng, *omega_in) > 0.0f)
+		if(dot(Ng, *omega_in) > 0.0f)
 		{
 			// common terms for pdf and eval
 			float cosNI = dot(sc->N, *omega_in);
 			// make sure the direction we chose is still in the right hemisphere
-			if (cosNI > 0)
+			if(cosNI > 0)
 			{
 				float cosp = powf(cosTheta, m_exponent);
 				float common = 0.5f * M_1_PI_F * cosp;
diff --git a/intern/cycles/kernel/closure/bsdf_reflection.h b/intern/cycles/kernel/closure/bsdf_reflection.h
index 0baccdf155c..303f4c9ce34 100644
--- a/intern/cycles/kernel/closure/bsdf_reflection.h
+++ b/intern/cycles/kernel/closure/bsdf_reflection.h
@@ -43,10 +43,6 @@ ccl_device int bsdf_reflection_setup(ShaderClosure *sc)
 	return SD_BSDF;
 }
 
-ccl_device void bsdf_reflection_blur(ShaderClosure *sc, float roughness)
-{
-}
-
 ccl_device float3 bsdf_reflection_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
@@ -70,8 +66,9 @@ ccl_device int bsdf_reflection_sample(const ShaderClosure *sc, float3 Ng, float3
 			*domega_in_dx = 2 * dot(N, dIdx) * N - dIdx;
 			*domega_in_dy = 2 * dot(N, dIdy) * N - dIdy;
 #endif
-			*pdf = 1;
-			*eval = make_float3(1, 1, 1);
+			/* Singular closure: use an arbitrary large pdf with a matching eval (ratio stays 1) for MIS. */
+			*pdf = 1e6f;
+			*eval = make_float3(1e6f, 1e6f, 1e6f);
 		}
 	}
 	return LABEL_REFLECT|LABEL_SINGULAR;
diff --git a/intern/cycles/kernel/closure/bsdf_refraction.h b/intern/cycles/kernel/closure/bsdf_refraction.h
index c4698b42060..c78a4b67134 100644
--- a/intern/cycles/kernel/closure/bsdf_refraction.h
+++ b/intern/cycles/kernel/closure/bsdf_refraction.h
@@ -43,10 +43,6 @@ ccl_device int bsdf_refraction_setup(ShaderClosure *sc)
 	return SD_BSDF;
 }
 
-ccl_device void bsdf_refraction_blur(ShaderClosure *sc, float roughness)
-{
-}
-
 ccl_device float3 bsdf_refraction_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
@@ -67,15 +63,17 @@ ccl_device int bsdf_refraction_sample(const ShaderClosure *sc, float3 Ng, float3
 	float3 dRdx, dRdy, dTdx, dTdy;
 #endif
 	bool inside;
-	fresnel_dielectric(m_eta, N, I, &R, &T,
+	float fresnel;
+	fresnel = fresnel_dielectric(m_eta, N, I, &R, &T,
 #ifdef __RAY_DIFFERENTIALS__
 		dIdx, dIdy, &dRdx, &dRdy, &dTdx, &dTdy,
 #endif
 		&inside);
-	
-	if(!inside) {
-		*pdf = 1.0f;
-		*eval = make_float3(1.0f, 1.0f, 1.0f);
+
+	if(!inside && fresnel != 1.0f) {
+		/* Use an arbitrary large pdf with a matching eval (ratio stays 1) for MIS. */
+		*pdf = 1e6f;
+		*eval = make_float3(1e6f, 1e6f, 1e6f);
 		*omega_in = T;
 #ifdef __RAY_DIFFERENTIALS__
 		*domega_in_dx = dTdx;
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index 797fa4227ae..e5b6ab93a64 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -40,16 +40,12 @@ CCL_NAMESPACE_BEGIN
 ccl_device int bsdf_diffuse_toon_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_DIFFUSE_TOON_ID;
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f);
-	sc->data1 = clamp(sc->data1, 0.0f, 1.0f);
+	sc->data0 = saturate(sc->data0);
+	sc->data1 = saturate(sc->data1);
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device void bsdf_diffuse_toon_blur(ShaderClosure *sc, float roughness)
-{
-}
-
 ccl_device float3 bsdf_toon_get_intensity(float max_angle, float smooth, float angle)
 {
 	float is;
@@ -124,16 +120,12 @@ ccl_device int bsdf_diffuse_toon_sample(const ShaderClosure *sc, float3 Ng, floa
 ccl_device int bsdf_glossy_toon_setup(ShaderClosure *sc)
 {
 	sc->type = CLOSURE_BSDF_GLOSSY_TOON_ID;
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f);
-	sc->data1 = clamp(sc->data1, 0.0f, 1.0f);
+	sc->data0 = saturate(sc->data0);
+	sc->data1 = saturate(sc->data1);
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
-ccl_device void bsdf_glossy_toon_blur(ShaderClosure *sc, float roughness)
-{
-}
-
 ccl_device float3 bsdf_glossy_toon_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	float max_angle = sc->data0*M_PI_2_F;
diff --git a/intern/cycles/kernel/closure/bsdf_transparent.h b/intern/cycles/kernel/closure/bsdf_transparent.h
index 73601d20c3a..3c2fd8004df 100644
--- a/intern/cycles/kernel/closure/bsdf_transparent.h
+++ b/intern/cycles/kernel/closure/bsdf_transparent.h
@@ -41,10 +41,6 @@ ccl_device int bsdf_transparent_setup(ShaderClosure *sc)
 	return SD_BSDF|SD_TRANSPARENT;
 }
 
-ccl_device void bsdf_transparent_blur(ShaderClosure *sc, float roughness)
-{
-}
-
 ccl_device float3 bsdf_transparent_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 	return make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/closure/bsdf_westin.h b/intern/cycles/kernel/closure/bsdf_westin.h
deleted file mode 100644
index 9dc1c00bb3d..00000000000
--- a/intern/cycles/kernel/closure/bsdf_westin.h
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Adapted from Open Shading Language with this license:
- *
- * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
- * All Rights Reserved.
- *
- * Modifications Copyright 2011, Blender Foundation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- * * Redistributions of source code must retain the above copyright
- *   notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- *   notice, this list of conditions and the following disclaimer in the
- *   documentation and/or other materials provided with the distribution.
- * * Neither the name of Sony Pictures Imageworks nor the names of its
- *   contributors may be used to endorse or promote products derived from
- *   this software without specific prior written permission.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __BSDF_WESTIN_H__
-#define __BSDF_WESTIN_H__
-
-CCL_NAMESPACE_BEGIN
-
-/* WESTIN BACKSCATTER */
-
-ccl_device int bsdf_westin_backscatter_setup(ShaderClosure *sc)
-{
-	float roughness = sc->data0;
-	roughness = clamp(roughness, 1e-5f, 1.0f);
-	float m_invroughness = 1.0f/roughness;
-
-	sc->type = CLOSURE_BSDF_WESTIN_BACKSCATTER_ID;
-	sc->data0 = m_invroughness;
-
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
-}
-
-ccl_device void bsdf_westin_backscatter_blur(ShaderClosure *sc, float roughness)
-{
-	float m_invroughness = sc->data0;
-	m_invroughness = min(1.0f/roughness, m_invroughness);
-	sc->data0 = m_invroughness;
-}
-
-ccl_device float3 bsdf_westin_backscatter_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
-	float m_invroughness = sc->data0;
-	float3 N = sc->N;
-
-	// pdf is implicitly 0 (no indirect sampling)
-	float cosNO = dot(N, I);
-	float cosNI = dot(N, omega_in);
-	if(cosNO > 0 && cosNI > 0) {
-		float cosine = dot(I, omega_in);
-		*pdf = cosine > 0 ? (m_invroughness + 1) * powf(cosine, m_invroughness) : 0;
-		*pdf *= 0.5f * M_1_PI_F;
-		return make_float3 (*pdf, *pdf, *pdf);
-	}
-	return make_float3 (0, 0, 0);
-}
-
-ccl_device float3 bsdf_westin_backscatter_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
-	return make_float3(0.0f, 0.0f, 0.0f);
-}
-
-ccl_device int bsdf_westin_backscatter_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
-{
-	float m_invroughness = sc->data0;
-	float3 N = sc->N;
-
-	float cosNO = dot(N, I);
-	if(cosNO > 0) {
-#ifdef __RAY_DIFFERENTIALS__
-		*domega_in_dx = dIdx;
-		*domega_in_dy = dIdy;
-#endif
-		float3 T, B;
-		make_orthonormals (I, &T, &B);
-		float phi = M_2PI_F * randu;
-		float cosTheta = powf(randv, 1 / (m_invroughness + 1));
-		float sinTheta2 = 1 - cosTheta * cosTheta;
-		float sinTheta = sinTheta2 > 0 ? sqrtf(sinTheta2) : 0;
-		*omega_in = (cosf(phi) * sinTheta) * T +
-		            (sinf(phi) * sinTheta) * B +
-		            (cosTheta) * I;
-		if(dot(Ng, *omega_in) > 0) {
-			// common terms for pdf and eval
-			float cosNI = dot(N, *omega_in);
-			// make sure the direction we chose is still in the right hemisphere
-			if(cosNI > 0)
-			{
-				*pdf = 0.5f * M_1_PI_F * powf(cosTheta, m_invroughness);
-				*pdf = (m_invroughness + 1) * (*pdf);
-				*eval = make_float3(*pdf, *pdf, *pdf);
-			}
-		}
-	}
-	return LABEL_REFLECT|LABEL_GLOSSY;
-}
-
-/* WESTIN SHEEN */
-
-ccl_device int bsdf_westin_sheen_setup(ShaderClosure *sc)
-{
-	/* float edginess = sc->data0; */
-	sc->type = CLOSURE_BSDF_WESTIN_SHEEN_ID;
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
-}
-
-ccl_device void bsdf_westin_sheen_blur(ShaderClosure *sc, float roughness)
-{
-}
-
-ccl_device float3 bsdf_westin_sheen_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
-	float m_edginess = sc->data0;
-	float3 N = sc->N;
-
-	// pdf is implicitly 0 (no indirect sampling)
-	float cosNO = dot(N, I);
-	float cosNI = dot(N, omega_in);
-	if(cosNO > 0 && cosNI > 0) {
-		float sinNO2 = 1 - cosNO * cosNO;
-		*pdf = cosNI * M_1_PI_F;
-		float westin = sinNO2 > 0 ? powf(sinNO2, 0.5f * m_edginess) * (*pdf) : 0;
-		return make_float3 (westin, westin, westin);
-	}
-	return make_float3 (0, 0, 0);
-}
-
-ccl_device float3 bsdf_westin_sheen_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
-	return make_float3(0.0f, 0.0f, 0.0f);
-}
-
-ccl_device int bsdf_westin_sheen_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
-{
-	float m_edginess = sc->data0;
-	float3 N = sc->N;
-
-	// we are viewing the surface from the right side - send a ray out with cosine
-	// distribution over the hemisphere
-	sample_cos_hemisphere(N, randu, randv, omega_in, pdf);
-	if(dot(Ng, *omega_in) > 0) {
-		// TODO: account for sheen when sampling
-		float cosNO = dot(N, I);
-		float sinNO2 = 1 - cosNO * cosNO;
-		float westin = sinNO2 > 0 ? powf(sinNO2, 0.5f * m_edginess) * (*pdf) : 0;
-		*eval = make_float3(westin, westin, westin);
-#ifdef __RAY_DIFFERENTIALS__
-		// TODO: find a better approximation for the diffuse bounce
-		*domega_in_dx = (2 * dot(N, dIdx)) * N - dIdx;
-		*domega_in_dy = (2 * dot(N, dIdy)) * N - dIdy;
-#endif
-	}
-	else {
-		pdf = 0;
-	}
-	return LABEL_REFLECT|LABEL_DIFFUSE;
-}
-
-CCL_NAMESPACE_END
-
-#endif /* __BSDF_WESTIN_H__ */
-
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index 3849dedc3b6..f817dcd5f2d 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __KERNEL_BSSRDF_H__
@@ -30,8 +30,8 @@ ccl_device int bssrdf_setup(ShaderClosure *sc, ClosureType type)
 		return flag;
 	}
 	else {
-		sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* texture blur */
-		sc->T.x = clamp(sc->T.x, 0.0f, 1.0f); /* sharpness */
+		sc->data1 = saturate(sc->data1); /* texture blur */
+		sc->T.x = saturate(sc->T.x); /* sharpness */
 		sc->type = type;
 
 		return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF;
@@ -157,7 +157,7 @@ ccl_device float bssrdf_cubic_quintic_root_find(float xi)
 	float x = 0.25f;
 	int i;
 
-	for (i = 0; i < max_iteration_count; i++) {
+	for(i = 0; i < max_iteration_count; i++) {
 		float x2 = x*x;
 		float x3 = x2*x;
 		float nx = (1.0f - x);
@@ -168,7 +168,7 @@ ccl_device float bssrdf_cubic_quintic_root_find(float xi)
 		if(fabsf(f) < tolerance || f_ == 0.0f)
 			break;
 
-		x = clamp(x - f/f_, 0.0f, 1.0f);
+		x = saturate(x - f/f_);
 	}
 
 	return x;
diff --git a/intern/cycles/kernel/closure/volume.h b/intern/cycles/kernel/closure/volume.h
index 058c4b8408f..4d71ba50ec3 100644
--- a/intern/cycles/kernel/closure/volume.h
+++ b/intern/cycles/kernel/closure/volume.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __VOLUME_H__
@@ -26,9 +26,6 @@ CCL_NAMESPACE_BEGIN
  * uniform sphere. g=0 uniform diffuse-like, g=1 close to sharp single ray. */
 ccl_device float single_peaked_henyey_greenstein(float cos_theta, float g)
 {
-	if(fabsf(g) < 1e-3f)
-		return M_1_PI_F * 0.25f;
-	
 	return ((1.0f - g * g) / safe_powf(1.0f + g * g - 2.0f * g * cos_theta, 1.5f)) * (M_1_PI_F * 0.25f);
 };
 
@@ -39,7 +36,7 @@ ccl_device int volume_henyey_greenstein_setup(ShaderClosure *sc)
 	/* clamp anisotropy to avoid delta function */
 	sc->data0 = signf(sc->data0) * min(fabsf(sc->data0), 1.0f - 1e-3f);
 
-	return SD_SCATTER|SD_PHASE_HAS_EVAL;
+	return SD_SCATTER;
 }
 
 ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderClosure *sc, const float3 I, float3 omega_in, float *pdf)
@@ -47,9 +44,13 @@ ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderClosure *sc, c
 	float g = sc->data0;
 
 	/* note that I points towards the viewer */
-	float cos_theta = dot(-I, omega_in);
-
-	*pdf = single_peaked_henyey_greenstein(cos_theta, g);
+	if(fabsf(g) < 1e-3f) {
+		*pdf = M_1_PI_F * 0.25f;
+	}
+	else {
+		float cos_theta = dot(-I, omega_in);
+		*pdf = single_peaked_henyey_greenstein(cos_theta, g);
+	}
 
 	return make_float3(*pdf, *pdf, *pdf);
 }
@@ -63,10 +64,12 @@ ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc, float3 I
 	/* match pdf for small g */
 	if(fabsf(g) < 1e-3f) {
 		cos_theta = (1.0f - 2.0f * randu);
+		*pdf = M_1_PI_F * 0.25f;
 	}
 	else {
 		float k = (1.0f - g * g) / (1.0f - g + 2.0f * g * randu);
 		cos_theta = (1.0f + g * g - k * k) / (2.0f * g);
+		*pdf = single_peaked_henyey_greenstein(cos_theta, g);
 	}
 
 	float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta);
@@ -80,7 +83,6 @@ ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc, float3 I
 	make_orthonormals(-I, &T, &B);
 	*omega_in = sin_theta * cos_phi * T + sin_theta * sin_phi * B + cos_theta * (-I);
 
-	*pdf = single_peaked_henyey_greenstein(cos_theta, g);
 	*eval = make_float3(*pdf, *pdf, *pdf); /* perfect importance sampling */
 
 #ifdef __RAY_DIFFERENTIALS__
@@ -105,18 +107,9 @@ ccl_device int volume_absorption_setup(ShaderClosure *sc)
 
 ccl_device float3 volume_phase_eval(const ShaderData *sd, const ShaderClosure *sc, float3 omega_in, float *pdf)
 {
-	float3 eval;
-
-	switch(sc->type) {
-		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-			eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
-			break;
-		default:
-			eval = make_float3(0.0f, 0.0f, 0.0f);
-			break;
-	}
+	kernel_assert(sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID);
 
-	return eval;
+	return volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
 }
 
 ccl_device int volume_phase_sample(const ShaderData *sd, const ShaderClosure *sc, float randu,
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index 9495a2541f9..5ab900d47aa 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -20,7 +20,11 @@
 
 /* 64 object BVH + 64 mesh BVH + 64 object node splitting */
 #define BVH_STACK_SIZE 192
+#define BVH_QSTACK_SIZE 384
 #define BVH_NODE_SIZE 4
+#define BVH_NODE_LEAF_SIZE 1
+#define BVH_QNODE_SIZE 7
+#define BVH_QNODE_LEAF_SIZE 1
 #define TRI_NODE_SIZE 3
 
 /* silly workaround for float extended precision that happens when compiling
@@ -35,6 +39,7 @@
 #include "geom_attribute.h"
 #include "geom_object.h"
 #include "geom_triangle.h"
+#include "geom_triangle_intersect.h"
 #include "geom_motion_triangle.h"
 #include "geom_motion_curve.h"
 #include "geom_curve.h"
diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h
index 63ce31c492f..c7364e9edac 100644
--- a/intern/cycles/kernel/geom/geom_attribute.h
+++ b/intern/cycles/kernel/geom/geom_attribute.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -29,24 +29,27 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeElement *elem)
 {
-	if(sd->object == PRIM_NONE)
+	if(ccl_fetch(sd, object) == PRIM_NONE)
 		return (int)ATTR_STD_NOT_FOUND;
 
 	/* for SVM, find attribute by unique id */
-	uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
+	uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride;
 #ifdef __HAIR__
-	attr_offset = (sd->type & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
+	attr_offset = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
 #endif
 	uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 	
 	while(attr_map.x != id) {
+		if(UNLIKELY(attr_map.x == ATTR_STD_NONE)) {
+			return ATTR_STD_NOT_FOUND;
+		}
 		attr_offset += ATTR_PRIM_TYPES;
 		attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 	}
 
 	*elem = (AttributeElement)attr_map.y;
 	
-	if(sd->prim == PRIM_NONE && (AttributeElement)attr_map.y != ATTR_ELEMENT_MESH)
+	if(ccl_fetch(sd, prim) == PRIM_NONE && (AttributeElement)attr_map.y != ATTR_ELEMENT_MESH)
 		return ATTR_STD_NOT_FOUND;
 
 	/* return result */
diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/geom/geom_bvh.h
index dd6abe32fec..3d0d406dd0b 100644
--- a/intern/cycles/kernel/geom/geom_bvh.h
+++ b/intern/cycles/kernel/geom/geom_bvh.h
@@ -28,6 +28,13 @@
 
 CCL_NAMESPACE_BEGIN
 
+/* Don't inline intersect functions on GPU; this is faster. */
+#ifdef __KERNEL_GPU__
+#define ccl_device_intersect ccl_device_noinline
+#else
+#define ccl_device_intersect ccl_device_inline
+#endif
+
 /* BVH intersection function variations */
 
 #define BVH_INSTANCING			1
@@ -35,6 +42,19 @@ CCL_NAMESPACE_BEGIN
 #define BVH_HAIR				4
 #define BVH_HAIR_MINIMUM_WIDTH	8
 
+#define BVH_NAME_JOIN(x,y) x ## _ ## y
+#define BVH_NAME_EVAL(x,y) BVH_NAME_JOIN(x,y)
+#define BVH_FUNCTION_FULL_NAME(prefix) BVH_NAME_EVAL(prefix, BVH_FUNCTION_NAME)
+
+#define BVH_FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
+
+/* Common QBVH functions. */
+#ifdef __QBVH__
+#include "geom_qbvh.h"
+#endif
+
+/* Regular BVH traversal */
+
 #define BVH_FUNCTION_NAME bvh_intersect
 #define BVH_FUNCTION_FEATURES 0
 #include "geom_bvh_traversal.h"
@@ -63,6 +83,8 @@ CCL_NAMESPACE_BEGIN
 #include "geom_bvh_traversal.h"
 #endif
 
+/* Subsurface scattering BVH traversal */
+
 #if defined(__SUBSURFACE__)
 #define BVH_FUNCTION_NAME bvh_intersect_subsurface
 #define BVH_FUNCTION_FEATURES 0
@@ -93,43 +115,108 @@ CCL_NAMESPACE_BEGIN
 #include "geom_bvh_subsurface.h"
 #endif
 
+/* Volume BVH traversal */
+
+#if defined(__VOLUME__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume
+#define BVH_FUNCTION_FEATURES 0
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__INSTANCING__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__HAIR__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_hair
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_hair_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
+#include "geom_bvh_volume.h"
+#endif
+
+/* Record all intersections - Shadow BVH traversal */
+
 #if defined(__SHADOW_RECORD_ALL__)
 #define BVH_FUNCTION_NAME bvh_intersect_shadow_all
 #define BVH_FUNCTION_FEATURES 0
 #include "geom_bvh_shadow.h"
 #endif
 
-#if defined(__SUBSURFACE__) && defined(__INSTANCING__)
+#if defined(__SHADOW_RECORD_ALL__) && defined(__INSTANCING__)
 #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing
 #define BVH_FUNCTION_FEATURES BVH_INSTANCING
 #include "geom_bvh_shadow.h"
 #endif
 
-#if defined(__SUBSURFACE__) && defined(__HAIR__)
+#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__)
 #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
 #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
 #include "geom_bvh_shadow.h"
 #endif
 
-#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__)
+#if defined(__SHADOW_RECORD_ALL__) && defined(__OBJECT_MOTION__)
 #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
 #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
 #include "geom_bvh_shadow.h"
 #endif
 
-#if defined(__SUBSURFACE__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
 #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
 #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
 #include "geom_bvh_shadow.h"
 #endif
 
-/* to work around titan bug when using arrays instead of textures */
-#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
-ccl_device_inline
-#else
-ccl_device_noinline
+/* Record all intersections - Volume BVH traversal */
+
+#if defined(__VOLUME_RECORD_ALL__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_all
+#define BVH_FUNCTION_FEATURES 0
+#include "geom_bvh_volume_all.h"
+#endif
+
+#if defined(__VOLUME_RECORD_ALL__) && defined(__INSTANCING__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING
+#include "geom_bvh_volume_all.h"
+#endif
+
+#if defined(__VOLUME_RECORD_ALL__) && defined(__HAIR__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_all_hair
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
+#include "geom_bvh_volume_all.h"
 #endif
-bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect,
+
+#if defined(__VOLUME_RECORD_ALL__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+#include "geom_bvh_volume_all.h"
+#endif
+
+#if defined(__VOLUME_RECORD_ALL__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_all_hair_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
+#include "geom_bvh_volume_all.h"
+#endif
+
+#undef BVH_FEATURE
+#undef BVH_NAME_JOIN
+#undef BVH_NAME_EVAL
+#undef BVH_FUNCTION_FULL_NAME
+
+ccl_device_intersect bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect,
 					 uint *lcg_state, float difl, float extmax)
 {
 #ifdef __OBJECT_MOTION__
@@ -167,14 +254,8 @@ bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, I
 #endif /* __KERNEL_CPU__ */
 }
 
-/* to work around titan bug when using arrays instead of textures */
 #ifdef __SUBSURFACE__
-#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
-uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
+ccl_device_intersect uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
 {
 #ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
@@ -212,14 +293,8 @@ uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection
 }
 #endif
 
-/* to work around titan bug when using arrays instead of textures */
 #ifdef __SHADOW_RECORD_ALL__
-#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
-uint scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits)
+ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits)
 {
 #ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
@@ -237,26 +312,87 @@ uint scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection
 		return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits);
 #endif /* __HAIR__ */
 
-#ifdef __KERNEL_CPU__
-
 #ifdef __INSTANCING__
 	if(kernel_data.bvh.have_instancing)
 		return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
 #endif /* __INSTANCING__ */
 
 	return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
+}
+#endif
+
+#ifdef __VOLUME__
+ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
+                            const Ray *ray,
+                            Intersection *isect)
+{
+#ifdef __OBJECT_MOTION__
+	if(kernel_data.bvh.have_motion) {
+#ifdef __HAIR__
+		if(kernel_data.bvh.have_curves)
+			return bvh_intersect_volume_hair_motion(kg, ray, isect);
+#endif /* __HAIR__ */
+
+		return bvh_intersect_volume_motion(kg, ray, isect);
+	}
+#endif /* __OBJECT_MOTION__ */
+
+#ifdef __HAIR__
+	if(kernel_data.bvh.have_curves)
+		return bvh_intersect_volume_hair(kg, ray, isect);
+#endif /* __HAIR__ */
+
+#ifdef __KERNEL_CPU__
+
+#ifdef __INSTANCING__
+	if(kernel_data.bvh.have_instancing)
+		return bvh_intersect_volume_instancing(kg, ray, isect);
+#endif /* __INSTANCING__ */
+
+	return bvh_intersect_volume(kg, ray, isect);
 #else /* __KERNEL_CPU__ */
 
 #ifdef __INSTANCING__
-	return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
+	return bvh_intersect_volume_instancing(kg, ray, isect);
 #else
-	return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
+	return bvh_intersect_volume(kg, ray, isect);
 #endif /* __INSTANCING__ */
 
 #endif /* __KERNEL_CPU__ */
 }
 #endif
 
+#ifdef __VOLUME_RECORD_ALL__
+ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg,
+                                                     const Ray *ray,
+                                                     Intersection *isect,
+                                                     const uint max_hits)
+{ /* Forwards to the bvh_intersect_volume_all* variant picked by the compiled features and the kernel_data.bvh.have_* flags; returns that variant's uint result unchanged. */
+#ifdef __OBJECT_MOTION__
+	if(kernel_data.bvh.have_motion) {
+#ifdef __HAIR__
+		if(kernel_data.bvh.have_curves)
+			return bvh_intersect_volume_all_hair_motion(kg, ray, isect, max_hits);
+#endif /* __HAIR__ */
+
+		return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits);
+	}
+#endif /* __OBJECT_MOTION__ */
+
+#ifdef __HAIR__
+	if(kernel_data.bvh.have_curves)
+		return bvh_intersect_volume_all_hair(kg, ray, isect, max_hits);
+#endif /* __HAIR__ */
+
+#ifdef __INSTANCING__
+	if(kernel_data.bvh.have_instancing)
+		return bvh_intersect_volume_all_instancing(kg, ray, isect, max_hits);
+#endif /* __INSTANCING__ */
+
+	return bvh_intersect_volume_all(kg, ray, isect, max_hits); /* fall-through: none of the motion/curves/instancing branches above was taken */
+}
+#endif /* __VOLUME_RECORD_ALL__ */
+
 
 /* Ray offset to avoid self intersection.
  *
@@ -311,5 +447,21 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
 #endif
 }
 
+#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__)
+/* Three-way comparison of two Intersection records by their t member: returns -1, 1 or 0. ToDo: Move to another file? */
+ccl_device int intersections_compare(const void *a, const void *b)
+{
+	const Intersection *isect_a = (const Intersection*)a;
+	const Intersection *isect_b = (const Intersection*)b;
+
+	if(isect_a->t < isect_b->t)
+		return -1;
+	else if(isect_a->t > isect_b->t)
+		return 1;
+	else
+		return 0;
+}
+#endif /* __SHADOW_RECORD_ALL__ || __VOLUME_RECORD_ALL__ */
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/geom/geom_bvh_shadow.h
index aee4097d77e..e4cba99dc96 100644
--- a/intern/cycles/kernel/geom/geom_bvh_shadow.h
+++ b/intern/cycles/kernel/geom/geom_bvh_shadow.h
@@ -17,6 +17,10 @@
  * limitations under the License.
  */
 
+#ifdef __QBVH__
+#include "geom_qbvh_shadow.h"
+#endif
+
 /* This is a template BVH traversal function, where various features can be
  * enabled/disabled. This way we can compile optimized versions for each case
  * without new features slowing things down.
@@ -27,10 +31,11 @@
  *
  */
 
-#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
-
-ccl_device bool BVH_FUNCTION_NAME
-(KernelGlobals *kg, const Ray *ray, Intersection *isect_array, const uint max_hits, uint *num_hits)
+ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                            const Ray *ray,
+                                            Intersection *isect_array,
+                                            const uint max_hits,
+                                            uint *num_hits)
 {
 	/* todo:
 	 * - likely and unlikely for if() statements
@@ -53,11 +58,11 @@ ccl_device bool BVH_FUNCTION_NAME
 	int object = OBJECT_NONE;
 	float isect_t = tmax;
 
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 	Transform ob_tfm;
 #endif
 
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 	int num_hits_in_instance = 0;
 #endif
 
@@ -81,6 +86,9 @@ ccl_device bool BVH_FUNCTION_NAME
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
 	/* traversal loop */
 	do {
 		do {
@@ -174,6 +182,7 @@ ccl_device bool BVH_FUNCTION_NAME
 					}
 
 					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
 					traversalStack[stackPtr] = nodeAddrChild1;
 				}
 				else {
@@ -191,13 +200,15 @@ ccl_device bool BVH_FUNCTION_NAME
 
 			/* if node is leaf, fetch triangle list */
 			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1));
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE);
 				int primAddr = __float_as_int(leaf.x);
 
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 				if(primAddr >= 0) {
 #endif
-					int primAddr2 = __float_as_int(leaf.y);
+					const int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+					const uint p_type = type & PRIMITIVE_ALL;
 
 					/* pop */
 					nodeAddr = traversalStack[stackPtr];
@@ -205,25 +216,26 @@ ccl_device bool BVH_FUNCTION_NAME
 
 					/* primitive intersection */
 					while(primAddr < primAddr2) {
+						kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+
 						bool hit;
-						uint type = kernel_tex_fetch(__prim_type, primAddr);
 
 						/* todo: specialized intersect functions which don't fill in
 						 * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
 						 * might give a few % performance improvement */
 
-						switch(type & PRIMITIVE_ALL) {
+						switch(p_type) {
 							case PRIMITIVE_TRIANGLE: {
-								hit = triangle_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr);
+								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, PATH_RAY_SHADOW, object, primAddr);
 								break;
 							}
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 							case PRIMITIVE_MOTION_TRIANGLE: {
 								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, primAddr);
 								break;
 							}
 #endif
-#if FEATURE(BVH_HAIR)
+#if BVH_FEATURE(BVH_HAIR)
 							case PRIMITIVE_CURVE:
 							case PRIMITIVE_MOTION_CURVE: {
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
@@ -252,7 +264,7 @@ ccl_device bool BVH_FUNCTION_NAME
 							if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
 #endif
 							{
-								shader =  kernel_tex_fetch(__tri_shader, prim);
+								shader = kernel_tex_fetch(__tri_shader, prim);
 							}
 #ifdef __HAIR__
 							else {
@@ -274,7 +286,7 @@ ccl_device bool BVH_FUNCTION_NAME
 							/* move on to next entry in intersections array */
 							isect_array++;
 							(*num_hits)++;
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 							num_hits_in_instance++;
 #endif
 
@@ -284,52 +296,55 @@ ccl_device bool BVH_FUNCTION_NAME
 						primAddr++;
 					}
 				}
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 				else {
 					/* instance push */
 					object = kernel_tex_fetch(__prim_object, -primAddr-1);
 
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
 #else
 					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
 #endif
 
+					triangle_intersect_precalc(dir, &isect_precalc);
 					num_hits_in_instance = 0;
+					isect_array->t = isect_t;
 
 #if defined(__KERNEL_SSE2__)
 					Psplat[0] = ssef(P.x);
 					Psplat[1] = ssef(P.y);
 					Psplat[2] = ssef(P.z);
 
-					isect_array->t = isect_t;
 					tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
-
 					gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
 					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
 					traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
 
 					nodeAddr = kernel_tex_fetch(__object_node, object);
 				}
 			}
-#endif
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
 		} while(nodeAddr != ENTRYPOINT_SENTINEL);
 
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 		if(stackPtr >= 0) {
 			kernel_assert(object != OBJECT_NONE);
 
 			if(num_hits_in_instance) {
 				float t_fac;
 
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_tfm);
 #else
 				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
 #endif
 
+				triangle_intersect_precalc(dir, &isect_precalc);
+
 				/* scale isect->t to adjust for instancing */
 				for(int i = 0; i < num_hits_in_instance; i++)
 					(isect_array-i-1)->t *= t_fac;
@@ -337,22 +352,23 @@ ccl_device bool BVH_FUNCTION_NAME
 			else {
 				float ignore_t = FLT_MAX;
 
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_tfm);
 #else
 				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
 #endif
+				triangle_intersect_precalc(dir, &isect_precalc);
 			}
 
+			isect_t = tmax;
+			isect_array->t = isect_t;
+
 #if defined(__KERNEL_SSE2__)
 			Psplat[0] = ssef(P.x);
 			Psplat[1] = ssef(P.y);
 			Psplat[2] = ssef(P.z);
 
-			isect_t = tmax;
-			isect_array->t = isect_t;
 			tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
-
 			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
@@ -360,13 +376,37 @@ ccl_device bool BVH_FUNCTION_NAME
 			nodeAddr = traversalStack[stackPtr];
 			--stackPtr;
 		}
-#endif
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
 	} while(nodeAddr != ENTRYPOINT_SENTINEL);
 
 	return false;
 }
 
-#undef FEATURE
+ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+                                         const Ray *ray,
+                                         Intersection *isect_array,
+                                         const uint max_hits,
+                                         uint *num_hits)
+{ /* Entry point for this feature set: forwards all arguments to the QBVH or the BVH instantiation depending on kernel_data.bvh.use_qbvh. */
+#ifdef __QBVH__
+	if(kernel_data.bvh.use_qbvh) {
+		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+		                                    ray,
+		                                    isect_array,
+		                                    max_hits,
+		                                    num_hits);
+	}
+	else
+#endif  /* __QBVH__ */
+	{
+		kernel_assert(kernel_data.bvh.use_qbvh == false);  /* can only fail when __QBVH__ is not compiled in but use_qbvh is set */
+		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+		                                   ray,
+		                                   isect_array,
+		                                   max_hits,
+		                                   num_hits);
+	}
+}
+
 #undef BVH_FUNCTION_NAME
 #undef BVH_FUNCTION_FEATURES
-
diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
index a8f57cffa78..a73139f9c88 100644
--- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h
+++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
@@ -17,6 +17,10 @@
  * limitations under the License.
  */
 
+#ifdef __QBVH__
+#include "geom_qbvh_subsurface.h"
+#endif
+
 /* This is a template BVH traversal function for subsurface scattering, where
  * various features can be enabled/disabled. This way we can compile optimized
  * versions for each case without new features slowing things down.
@@ -26,10 +30,12 @@
  *
  */
 
-#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
-
-ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersection *isect_array,
-	int subsurface_object, uint *lcg_state, int max_hits)
+ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                            const Ray *ray,
+                                            Intersection *isect_array,
+                                            int subsurface_object,
+                                            uint *lcg_state,
+                                            int max_hits)
 {
 	/* todo:
 	 * - test if pushing distance on the stack helps (for non shadow rays)
@@ -54,10 +60,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 	int object = OBJECT_NONE;
 	float isect_t = ray->t;
 
-	const uint visibility = PATH_RAY_ALL_VISIBILITY;
 	uint num_hits = 0;
 
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 	Transform ob_tfm;
 #endif
 
@@ -78,6 +83,9 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
 	/* traversal loop */
 	do {
 		do
@@ -118,14 +126,8 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 				NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
 
 				/* decide which nodes to traverse next */
-#ifdef __VISIBILITY_FLAG__
-				/* this visibility test gives a 5% performance hit, how to solve? */
-				traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
-				traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
-#else
 				traverseChild0 = (c0max >= c0min);
 				traverseChild1 = (c1max >= c1min);
-#endif
 
 #else // __KERNEL_SSE2__
 				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
@@ -145,14 +147,8 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 				const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
 
 				/* decide which nodes to traverse next */
-#ifdef __VISIBILITY_FLAG__
-				/* this visibility test gives a 5% performance hit, how to solve? */
-				traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
-				traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
-#else
 				traverseChild0 = (movemask(lrhit) & 1);
 				traverseChild1 = (movemask(lrhit) & 2);
-#endif
 #endif // __KERNEL_SSE2__
 
 				nodeAddr = __float_as_int(cnodes.x);
@@ -173,6 +169,7 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 					}
 
 					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
 					traversalStack[stackPtr] = nodeAddrChild1;
 				}
 				else {
@@ -190,57 +187,64 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 
 			/* if node is leaf, fetch triangle list */
 			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1));
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE);
 				int primAddr = __float_as_int(leaf.x);
 
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 				if(primAddr >= 0) {
 #endif
-					int primAddr2 = __float_as_int(leaf.y);
+					const int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
 
 					/* pop */
 					nodeAddr = traversalStack[stackPtr];
 					--stackPtr;
 
 					/* primitive intersection */
-					for(; primAddr < primAddr2; primAddr++) {
-						/* only primitives from the same object */
-						uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
-
-						if(tri_object != subsurface_object)
-							continue;
-
-						/* intersect ray against primitive */
-						uint type = kernel_tex_fetch(__prim_type, primAddr);
-
-						switch(type & PRIMITIVE_ALL) {
-							case PRIMITIVE_TRIANGLE: {
-								triangle_intersect_subsurface(kg, isect_array, P, dir, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
-								break;
+					switch(type & PRIMITIVE_ALL) {
+						case PRIMITIVE_TRIANGLE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from the same object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								if(tri_object != subsurface_object)
+									continue;
+								triangle_intersect_subsurface(kg, &isect_precalc, isect_array, P, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
 							}
-#if FEATURE(BVH_MOTION)
-							case PRIMITIVE_MOTION_TRIANGLE: {
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from the same object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								if(tri_object != subsurface_object)
+									continue;
 								motion_triangle_intersect_subsurface(kg, isect_array, P, dir, ray->time, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
-								break;
 							}
+							break;
+						}
 #endif
-							default: {
-								break;
-							}
+						default: {
+							break;
 						}
 					}
 				}
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 				else {
 					/* instance push */
 					if(subsurface_object == kernel_tex_fetch(__prim_object, -primAddr-1)) {
 						object = subsurface_object;
 
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
 #else
 						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
 #endif
+						triangle_intersect_precalc(dir, &isect_precalc);
 
 #if defined(__KERNEL_SSE2__)
 						Psplat[0] = ssef(P.x);
@@ -253,6 +257,7 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 #endif
 
 						++stackPtr;
+						kernel_assert(stackPtr < BVH_STACK_SIZE);
 						traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
 
 						nodeAddr = kernel_tex_fetch(__object_node, object);
@@ -264,20 +269,22 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 					}
 				}
 			}
-#endif
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
 		} while(nodeAddr != ENTRYPOINT_SENTINEL);
 
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 		if(stackPtr >= 0) {
 			kernel_assert(object != OBJECT_NONE);
 
 			/* instance pop */
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
 #else
 			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect_t);
 #endif
 
+			triangle_intersect_precalc(dir, &isect_precalc);
+
 #if defined(__KERNEL_SSE2__)
 			Psplat[0] = ssef(P.x);
 			Psplat[1] = ssef(P.y);
@@ -292,13 +299,40 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 			nodeAddr = traversalStack[stackPtr];
 			--stackPtr;
 		}
-#endif
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
 	} while(nodeAddr != ENTRYPOINT_SENTINEL);
 
 	return num_hits;
 }
 
-#undef FEATURE
+ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
+                                         const Ray *ray,
+                                         Intersection *isect_array,
+                                         int subsurface_object,
+                                         uint *lcg_state,
+                                         int max_hits)
+{ /* Entry point for this feature set: forwards all arguments to the QBVH or the BVH instantiation depending on kernel_data.bvh.use_qbvh. */
+#ifdef __QBVH__
+	if(kernel_data.bvh.use_qbvh) {
+		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+		                                    ray,
+		                                    isect_array,
+		                                    subsurface_object,
+		                                    lcg_state,
+		                                    max_hits);
+	}
+	else
+#endif  /* __QBVH__ */
+	{
+		kernel_assert(kernel_data.bvh.use_qbvh == false);  /* can only fail when __QBVH__ is not compiled in but use_qbvh is set */
+		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+		                                   ray,
+		                                   isect_array,
+		                                   subsurface_object,
+		                                   lcg_state,
+		                                   max_hits);
+	}
+}
+
 #undef BVH_FUNCTION_NAME
 #undef BVH_FUNCTION_FEATURES
-
diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h
index e39228c33de..73d79fd78ee 100644
--- a/intern/cycles/kernel/geom/geom_bvh_traversal.h
+++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h
@@ -17,6 +17,10 @@
  * limitations under the License.
  */
 
+#ifdef __QBVH__
+#include "geom_qbvh_traversal.h"
+#endif
+
 /* This is a template BVH traversal function, where various features can be
  * enabled/disabled. This way we can compile optimized versions for each case
  * without new features slowing things down.
@@ -28,14 +32,16 @@
  *
  */
 
-#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
-
-ccl_device bool BVH_FUNCTION_NAME
-(KernelGlobals *kg, const Ray *ray, Intersection *isect, const uint visibility
-#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-, uint *lcg_state, float difl, float extmax
+ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                            const Ray *ray,
+                                            Intersection *isect,
+                                            const uint visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+                                            , uint *lcg_state,
+                                            float difl,
+                                            float extmax
 #endif
-)
+                                            )
 {
 	/* todo:
 	 * - test if pushing distance on the stack helps (for non shadow rays)
@@ -58,15 +64,20 @@ ccl_device bool BVH_FUNCTION_NAME
 	float3 idir = bvh_inverse_direction(dir);
 	int object = OBJECT_NONE;
 
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 	Transform ob_tfm;
 #endif
 
 	isect->t = ray->t;
-	isect->object = OBJECT_NONE;
-	isect->prim = PRIM_NONE;
 	isect->u = 0.0f;
 	isect->v = 0.0f;
+	isect->prim = PRIM_NONE;
+	isect->object = OBJECT_NONE;
+
+#if defined(__KERNEL_DEBUG__)
+	isect->num_traversal_steps = 0;
+	isect->num_traversed_instances = 0;
+#endif
 
 #if defined(__KERNEL_SSE2__)
 	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
@@ -85,6 +96,9 @@ ccl_device bool BVH_FUNCTION_NAME
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
 	/* traversal loop */
 	do {
 		do {
@@ -122,7 +136,7 @@ ccl_device bool BVH_FUNCTION_NAME
 				NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
 				NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
 
-#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
 				if(difl != 0.0f) {
 					float hdiff = 1.0f + difl;
 					float ldiff = 1.0f - difl;
@@ -163,7 +177,7 @@ ccl_device bool BVH_FUNCTION_NAME
 				ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
 				const ssef tminmax = minmax ^ pn;
 
-#if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
 				if(difl != 0.0f) {
 					float4 *tminmaxview = (float4*)&tminmax;
 					float &c0min = tminmaxview->x, &c1min = tminmaxview->y;
@@ -213,6 +227,7 @@ ccl_device bool BVH_FUNCTION_NAME
 					}
 
 					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
 					traversalStack[stackPtr] = nodeAddrChild1;
 				}
 				else {
@@ -226,80 +241,112 @@ ccl_device bool BVH_FUNCTION_NAME
 						--stackPtr;
 					}
 				}
+
+#if defined(__KERNEL_DEBUG__)
+				isect->num_traversal_steps++;
+#endif
 			}
 
 			/* if node is leaf, fetch triangle list */
 			if(nodeAddr < 0) {
-				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1));
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE);
 				int primAddr = __float_as_int(leaf.x);
 
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 				if(primAddr >= 0) {
 #endif
-					int primAddr2 = __float_as_int(leaf.y);
+					const int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
 
 					/* pop */
 					nodeAddr = traversalStack[stackPtr];
 					--stackPtr;
 
 					/* primitive intersection */
-					while(primAddr < primAddr2) {
-						bool hit;
-						uint type = kernel_tex_fetch(__prim_type, primAddr);
-
-						switch(type & PRIMITIVE_ALL) {
-							case PRIMITIVE_TRIANGLE: {
-								hit = triangle_intersect(kg, isect, P, dir, visibility, object, primAddr);
-								break;
+					switch(type & PRIMITIVE_ALL) {
+						case PRIMITIVE_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+#if defined(__KERNEL_DEBUG__)
+								isect->num_traversal_steps++;
+#endif
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) {
+									/* shadow ray early termination */
+#if defined(__KERNEL_SSE2__)
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+#else
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+#endif
+								}
 							}
-#if FEATURE(BVH_MOTION)
-							case PRIMITIVE_MOTION_TRIANGLE: {
-								hit = motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr);
-								break;
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+#if defined(__KERNEL_DEBUG__)
+								isect->num_traversal_steps++;
+#endif
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								if(motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr)) {
+									/* shadow ray early termination */
+#if defined(__KERNEL_SSE2__)
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+#else
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+#endif
+								}
 							}
+							break;
+						}
+#endif  /* BVH_FEATURE(BVH_MOTION) */
+#if BVH_FEATURE(BVH_HAIR)
+						case PRIMITIVE_CURVE:
+						case PRIMITIVE_MOTION_CURVE: {
+							for(; primAddr < primAddr2; primAddr++) {
+#if defined(__KERNEL_DEBUG__)
+								isect->num_traversal_steps++;
 #endif
-#if FEATURE(BVH_HAIR)
-							case PRIMITIVE_CURVE:
-							case PRIMITIVE_MOTION_CURVE: {
-								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								bool hit;
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
 									hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
 								else
 									hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
-								break;
-							}
-#endif
-							default: {
-								hit = false;
-								break;
-							}
-						}
-
-						/* shadow ray early termination */
+								if(hit) {
+									/* shadow ray early termination */
 #if defined(__KERNEL_SSE2__)
-						if(hit) {
-							if(visibility == PATH_RAY_SHADOW_OPAQUE)
-								return true;
-
-							tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
-						}
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
 #else
-						if(hit && visibility == PATH_RAY_SHADOW_OPAQUE)
-							return true;
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
 #endif
-
-						primAddr++;
+								}
+							}
+							break;
+						}
+#endif  /* BVH_FEATURE(BVH_HAIR) */
 					}
 				}
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 				else {
 					/* instance push */
 					object = kernel_tex_fetch(__prim_object, -primAddr-1);
 
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
 #else
 					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
 #endif
+					triangle_intersect_precalc(dir, &isect_precalc);
 
 #if defined(__KERNEL_SSE2__)
 					Psplat[0] = ssef(P.x);
@@ -312,24 +359,30 @@ ccl_device bool BVH_FUNCTION_NAME
 #endif
 
 					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
 					traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
 
 					nodeAddr = kernel_tex_fetch(__object_node, object);
+
+#if defined(__KERNEL_DEBUG__)
+					isect->num_traversed_instances++;
+#endif
 				}
 			}
-#endif
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
 		} while(nodeAddr != ENTRYPOINT_SENTINEL);
 
-#if FEATURE(BVH_INSTANCING)
+#if BVH_FEATURE(BVH_INSTANCING)
 		if(stackPtr >= 0) {
 			kernel_assert(object != OBJECT_NONE);
 
 			/* instance pop */
-#if FEATURE(BVH_MOTION)
+#if BVH_FEATURE(BVH_MOTION)
 			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
 #else
 			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
 #endif
+			triangle_intersect_precalc(dir, &isect_precalc);
 
 #if defined(__KERNEL_SSE2__)
 			Psplat[0] = ssef(P.x);
@@ -345,13 +398,52 @@ ccl_device bool BVH_FUNCTION_NAME
 			nodeAddr = traversalStack[stackPtr];
 			--stackPtr;
 		}
-#endif
+#endif  /* FEATURE(BVH_INSTANCING) */
 	} while(nodeAddr != ENTRYPOINT_SENTINEL);
 
 	return (isect->prim != PRIM_NONE);
 }
 
-#undef FEATURE
+ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+                                         const Ray *ray,
+                                         Intersection *isect,
+                                         const uint visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+                                         , uint *lcg_state,
+                                         float difl,
+                                         float extmax
+#endif  /* BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) */
+                                         )
+{  /* Entry point: forwards all arguments unchanged to the QBVH or the BVH traversal variant. */
+#ifdef __QBVH__  /* the QBVH branch is only compiled when __QBVH__ is defined */
+	if(kernel_data.bvh.use_qbvh) {
+		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+		                                    ray,
+		                                    isect,
+		                                    visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+		                                    , lcg_state,
+		                                    difl,
+		                                    extmax
+#endif  /* BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) */
+		                                    );
+	}
+	else
+#endif  /* __QBVH__ */
+	{
+		kernel_assert(kernel_data.bvh.use_qbvh == false);  /* without __QBVH__ this block runs unconditionally */
+		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+		                                   ray,
+		                                   isect,
+		                                   visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+		                                   , lcg_state,
+		                                   difl,
+		                                   extmax
+#endif  /* BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH) */
+		                                   );
+	}
+}
+
 #undef BVH_FUNCTION_NAME
 #undef BVH_FUNCTION_FEATURES
-
diff --git a/intern/cycles/kernel/geom/geom_bvh_volume.h b/intern/cycles/kernel/geom/geom_bvh_volume.h
new file mode 100644
index 00000000000..41c784869f2
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_bvh_volume.h
@@ -0,0 +1,358 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __QBVH__
+#include "geom_qbvh_volume.h"
+#endif
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ * Primitives of objects without SD_OBJECT_HAS_VOLUME are skipped; returns (isect->prim != PRIM_NONE).
+ */
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                            const Ray *ray,
+                                            Intersection *isect)
+{
+	/* todo:
+	 * - test if pushing distance on the stack helps (for non shadow rays)
+	 * - separate version for shadow rays
+	 * - likely and unlikely for if() statements
+	 * - test restrict attribute for pointers
+	 */
+
+	/* traversal stack in CUDA thread-local memory */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL;
+
+	/* traversal variables in registers */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* ray parameters in registers */
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+
+	const uint visibility = PATH_RAY_ALL_VISIBILITY;  /* fixed: this variant takes no visibility argument */
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif  /* BVH_FEATURE(BVH_MOTION) */
+
+	isect->t = ray->t;
+	isect->u = 0.0f;
+	isect->v = 0.0f;
+	isect->prim = PRIM_NONE;
+	isect->object = OBJECT_NONE;
+
+#if defined(__KERNEL_SSE2__)
+	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+	
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+	ssef Psplat[3], idirsplat[3];
+	shuffle_swap_t shufflexyz[3];
+
+	Psplat[0] = ssef(P.x);
+	Psplat[1] = ssef(P.y);
+	Psplat[2] = ssef(P.z);
+
+	ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t);
+
+	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif  /* __KERNEL_SSE2__ */
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* traversal loop */
+	do {
+		do {
+			/* traverse internal nodes */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				bool traverseChild0, traverseChild1;
+				int nodeAddrChild1;
+
+#if !defined(__KERNEL_SSE2__)
+				/* Intersect two child bounding boxes, non-SSE version */
+				float t = isect->t;
+
+				/* fetch node data */
+				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
+				float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
+				float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
+
+				/* intersect ray against child nodes */
+				NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+				NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+				/* decide which nodes to traverse next */
+				traverseChild0 = (c0max >= c0min);
+				traverseChild1 = (c1max >= c1min);
+
+#else // __KERNEL_SSE2__
+				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+
+				/* fetch node data */
+				const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+				const float4 cnodes = ((float4*)bvh_nodes)[3];
+
+				/* intersect ray against child nodes */
+				const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+				const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+				const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+
+				/* calculate { c0min, c1min, -c0max, -c1max} */
+				ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+				const ssef tminmax = minmax ^ pn;
+
+				const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
+
+				/* decide which nodes to traverse next */
+				traverseChild0 = (movemask(lrhit) & 1);
+				traverseChild1 = (movemask(lrhit) & 2);
+#endif // __KERNEL_SSE2__
+
+				nodeAddr = __float_as_int(cnodes.x);
+				nodeAddrChild1 = __float_as_int(cnodes.y);
+
+				if(traverseChild0 && traverseChild1) {
+					/* both children were intersected, push the farther one */
+#if !defined(__KERNEL_SSE2__)
+					bool closestChild1 = (c1min < c0min);
+#else
+					bool closestChild1 = tminmax[1] < tminmax[0];
+#endif  /* !__KERNEL_SSE2__ */
+
+					if(closestChild1) {
+						int tmp = nodeAddr;
+						nodeAddr = nodeAddrChild1;
+						nodeAddrChild1 = tmp;
+					}
+
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
+					traversalStack[stackPtr] = nodeAddrChild1;
+				}
+				else {
+					/* one child was intersected */
+					if(traverseChild1) {
+						nodeAddr = nodeAddrChild1;
+					}
+					else if(!traverseChild0) {
+						/* neither child was intersected */
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+
+			/* if node is leaf, fetch triangle list */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE);
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+					const int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+
+					/* pop */
+					nodeAddr = traversalStack[stackPtr];
+					--stackPtr;
+
+					/* primitive intersection */
+					switch(type & PRIMITIVE_ALL) {
+						case PRIMITIVE_TRIANGLE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from volume object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr);
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from volume object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr);
+							}
+							break;
+						}
+#endif  /* BVH_FEATURE(BVH_MOTION) */
+#if BVH_FEATURE(BVH_HAIR)
+						case PRIMITIVE_CURVE:
+						case PRIMITIVE_MOTION_CURVE: {
+							/* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from volume object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)  /* NOTE(review): trailing NULL, 0, 0 presumably disable minimum-width handling - confirm */
+									bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								else
+									bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+							}
+							break;
+						}
+#endif  /* BVH_FEATURE(BVH_HAIR) */
+						default: {
+							break;
+						}
+					}
+				}  /* closes if(primAddr >= 0) with instancing, if(nodeAddr < 0) without */
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* instance push */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+					int object_flag = kernel_tex_fetch(__object_flag, object);
+
+					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
+#if BVH_FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
+#else
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+#endif  /* BVH_FEATURE(BVH_MOTION) */
+
+						triangle_intersect_precalc(dir, &isect_precalc);
+
+#if defined(__KERNEL_SSE2__)
+						Psplat[0] = ssef(P.x);
+						Psplat[1] = ssef(P.y);
+						Psplat[2] = ssef(P.z);
+
+						tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+
+						gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif  /* __KERNEL_SSE2__ */
+
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_STACK_SIZE);
+						traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* pop */
+						object = OBJECT_NONE;
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			/* instance pop */
+#if BVH_FEATURE(BVH_MOTION)
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
+#else
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+#endif  /* BVH_FEATURE(BVH_MOTION) */
+
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+#if defined(__KERNEL_SSE2__)
+			Psplat[0] = ssef(P.x);
+			Psplat[1] = ssef(P.y);
+			Psplat[2] = ssef(P.z);
+
+			tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+
+			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif  /* __KERNEL_SSE2__ */
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr];
+			--stackPtr;
+		}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return (isect->prim != PRIM_NONE);
+}
+
+ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+                                         const Ray *ray,
+                                         Intersection *isect)
+{  /* Entry point: forwards all arguments unchanged to the QBVH or the BVH traversal variant. */
+#ifdef __QBVH__  /* the QBVH branch is only compiled when __QBVH__ is defined */
+	if(kernel_data.bvh.use_qbvh) {
+		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+		                                    ray,
+		                                    isect);
+	}
+	else
+#endif  /* __QBVH__ */
+	{
+		kernel_assert(kernel_data.bvh.use_qbvh == false);  /* without __QBVH__ this block runs unconditionally */
+		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+		                                   ray,
+		                                   isect);
+	}
+}
+
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
diff --git a/intern/cycles/kernel/geom/geom_bvh_volume_all.h b/intern/cycles/kernel/geom/geom_bvh_volume_all.h
new file mode 100644
index 00000000000..b6db36f4b17
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_bvh_volume_all.h
@@ -0,0 +1,454 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __QBVH__
+#include "geom_qbvh_volume_all.h"
+#endif
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ * Records hits on SD_OBJECT_HAS_VOLUME objects in isect_array until max_hits is reached; returns the hit count.
+ */
+
+ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                            const Ray *ray,
+                                            Intersection *isect_array,
+                                            const uint max_hits)
+{
+	/* todo:
+	 * - test if pushing distance on the stack helps (for non shadow rays)
+	 * - separate version for shadow rays
+	 * - likely and unlikely for if() statements
+	 * - test restrict attribute for pointers
+	 */
+
+	/* traversal stack in CUDA thread-local memory */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL;
+
+	/* traversal variables in registers */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* ray parameters in registers */
+	const float tmax = ray->t;
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = tmax;
+
+	const uint visibility = PATH_RAY_ALL_VISIBILITY;  /* fixed: this variant takes no visibility argument */
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif  /* BVH_FEATURE(BVH_MOTION) */
+
+#if BVH_FEATURE(BVH_INSTANCING)
+	int num_hits_in_instance = 0;  /* reset on instance push only, so it is meaningful only while object != OBJECT_NONE */
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+
+	uint num_hits = 0;
+	isect_array->t = tmax;
+
+#if defined(__KERNEL_SSE2__)
+	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+	
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+	ssef Psplat[3], idirsplat[3];
+	shuffle_swap_t shufflexyz[3];
+
+	Psplat[0] = ssef(P.x);
+	Psplat[1] = ssef(P.y);
+	Psplat[2] = ssef(P.z);
+
+	ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
+
+	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif  /* __KERNEL_SSE2__ */
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* traversal loop */
+	do {
+		do {
+			/* traverse internal nodes */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				bool traverseChild0, traverseChild1;
+				int nodeAddrChild1;
+
+#if !defined(__KERNEL_SSE2__)
+				/* Intersect two child bounding boxes, non-SSE version */
+				float t = isect_array->t;
+
+				/* fetch node data */
+				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
+				float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
+				float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
+
+				/* intersect ray against child nodes */
+				NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+				NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+				/* decide which nodes to traverse next */
+				traverseChild0 = (c0max >= c0min);
+				traverseChild1 = (c1max >= c1min);
+
+#else // __KERNEL_SSE2__
+				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+
+				/* fetch node data */
+				const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+				const float4 cnodes = ((float4*)bvh_nodes)[3];
+
+				/* intersect ray against child nodes */
+				const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+				const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+				const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+
+				/* calculate { c0min, c1min, -c0max, -c1max} */
+				ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+				const ssef tminmax = minmax ^ pn;
+
+				const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
+
+				/* decide which nodes to traverse next */
+				traverseChild0 = (movemask(lrhit) & 1);
+				traverseChild1 = (movemask(lrhit) & 2);
+#endif // __KERNEL_SSE2__
+
+				nodeAddr = __float_as_int(cnodes.x);
+				nodeAddrChild1 = __float_as_int(cnodes.y);
+
+				if(traverseChild0 && traverseChild1) {
+					/* both children were intersected, push the farther one */
+#if !defined(__KERNEL_SSE2__)
+					bool closestChild1 = (c1min < c0min);
+#else
+					bool closestChild1 = tminmax[1] < tminmax[0];
+#endif  /* !__KERNEL_SSE2__ */
+
+					if(closestChild1) {
+						int tmp = nodeAddr;
+						nodeAddr = nodeAddrChild1;
+						nodeAddrChild1 = tmp;
+					}
+
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_STACK_SIZE);
+					traversalStack[stackPtr] = nodeAddrChild1;
+				}
+				else {
+					/* one child was intersected */
+					if(traverseChild1) {
+						nodeAddr = nodeAddrChild1;
+					}
+					else if(!traverseChild0) {
+						/* neither child was intersected */
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+
+			/* if node is leaf, fetch triangle list */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_NODE_LEAF_SIZE);
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+					const int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+					bool hit;
+
+					/* pop */
+					nodeAddr = traversalStack[stackPtr];
+					--stackPtr;
+
+					/* primitive intersection */
+					switch(type & PRIMITIVE_ALL) {
+						case PRIMITIVE_TRIANGLE: {  /* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from volume object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, primAddr);
+								if(hit) {
+									isect_array++;  /* Move on to next entry in intersections array. */
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									isect_array->t = isect_t;  /* NOTE(review): also written when num_hits == max_hits, so the array presumably needs max_hits + 1 entries - confirm */
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+										if(object != OBJECT_NONE) {  /* rescale only inside an instance; top-level hits have no object transform */
+#  if BVH_FEATURE(BVH_MOTION)
+											float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir));
+#  else
+											Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+											float t_fac = len(transform_direction(&tfm, 1.0f/idir));
+#  endif
+											for(int i = 0; i < num_hits_in_instance; i++) {
+												(isect_array-i-1)->t *= t_fac;
+											}
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+								}
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {  /* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from volume object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, primAddr);
+								if(hit) {
+									isect_array++;  /* Move on to next entry in intersections array. */
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									isect_array->t = isect_t;
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+										if(object != OBJECT_NONE) {  /* rescale only inside an instance; top-level hits have no object transform */
+#  if BVH_FEATURE(BVH_MOTION)
+											float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir));
+#  else
+											Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+											float t_fac = len(transform_direction(&tfm, 1.0f/idir));
+#  endif
+											for(int i = 0; i < num_hits_in_instance; i++) {
+												(isect_array-i-1)->t *= t_fac;
+											}
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+								}
+							}
+							break;
+						}
+#endif  /* BVH_FEATURE(BVH_MOTION) */
+#if BVH_FEATURE(BVH_HAIR)
+						case PRIMITIVE_CURVE:
+						case PRIMITIVE_MOTION_CURVE: {  /* intersect ray against primitive */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* only primitives from volume object */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
+									hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								else
+									hit = bvh_curve_intersect(kg, isect_array, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								if(hit) {
+									isect_array++;  /* Move on to next entry in intersections array. */
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									isect_array->t = isect_t;
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+										if(object != OBJECT_NONE) {  /* rescale only inside an instance; top-level hits have no object transform */
+#  if BVH_FEATURE(BVH_MOTION)
+											float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir));
+#  else
+											Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+											float t_fac = len(transform_direction(&tfm, 1.0f/idir));
+#  endif
+											for(int i = 0; i < num_hits_in_instance; i++) {
+												(isect_array-i-1)->t *= t_fac;
+											}
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+								}
+							}
+							break;
+						}
+#endif  /* BVH_FEATURE(BVH_HAIR) */
+						default: {
+							break;
+						}
+					}
+				}  /* closes if(primAddr >= 0) with instancing, if(nodeAddr < 0) without */
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* instance push */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+					int object_flag = kernel_tex_fetch(__object_flag, object);
+
+					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
+#if BVH_FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
+#else
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+#endif  /* BVH_FEATURE(BVH_MOTION) */
+
+						triangle_intersect_precalc(dir, &isect_precalc);
+						num_hits_in_instance = 0;
+						isect_array->t = isect_t;
+
+#if defined(__KERNEL_SSE2__)
+						Psplat[0] = ssef(P.x);
+						Psplat[1] = ssef(P.y);
+						Psplat[2] = ssef(P.z);
+
+						tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+
+						gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif  /* __KERNEL_SSE2__ */
+
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_STACK_SIZE);
+						traversalStack[stackPtr] = ENTRYPOINT_SENTINEL;
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* pop */
+						object = OBJECT_NONE;
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			if(num_hits_in_instance) {
+				float t_fac;
+#if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_tfm);
+#else
+				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+#endif  /* BVH_FEATURE(BVH_MOTION) */
+				triangle_intersect_precalc(dir, &isect_precalc);
+				/* Scale isect->t to adjust for instancing. */
+				for(int i = 0; i < num_hits_in_instance; i++) {
+					(isect_array-i-1)->t *= t_fac;
+				}
+			}
+			else {
+				float ignore_t = FLT_MAX;
+#if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_tfm);
+#else
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+#endif  /* BVH_FEATURE(BVH_MOTION) */
+				triangle_intersect_precalc(dir, &isect_precalc);
+			}
+
+			isect_t = tmax;
+			isect_array->t = isect_t;
+
+#if defined(__KERNEL_SSE2__)
+			Psplat[0] = ssef(P.x);
+			Psplat[1] = ssef(P.y);
+			Psplat[2] = ssef(P.z);
+
+			tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
+
+			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif  /* __KERNEL_SSE2__ */
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr];
+			--stackPtr;
+		}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return num_hits;
+}
+
+ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
+                                         const Ray *ray,
+                                         Intersection *isect_array,
+                                         const uint max_hits)
+{  /* Entry point: forwards all arguments unchanged to the QBVH or the BVH traversal variant. */
+#ifdef __QBVH__  /* the QBVH branch is only compiled when __QBVH__ is defined */
+	if(kernel_data.bvh.use_qbvh) {
+		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
+		                                    ray,
+		                                    isect_array,
+		                                    max_hits);
+	}
+	else
+#endif  /* __QBVH__ */
+	{
+		kernel_assert(kernel_data.bvh.use_qbvh == false);  /* without __QBVH__ this block runs unconditionally */
+		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
+		                                   ray,
+		                                   isect_array,
+		                                   max_hits);
+	}
+}
+
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index c4e9e2ababe..9653ad8f1bb 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -32,22 +32,22 @@ ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 		if(dy) *dy = 0.0f;
 #endif
 
-		return kernel_tex_fetch(__attributes_float, offset + sd->prim);
+		return kernel_tex_fetch(__attributes_float, offset + ccl_fetch(sd, prim));
 	}
 	else if(elem == ATTR_ELEMENT_CURVE_KEY || elem == ATTR_ELEMENT_CURVE_KEY_MOTION) {
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 		int k1 = k0 + 1;
 
 		float f0 = kernel_tex_fetch(__attributes_float, offset + k0);
 		float f1 = kernel_tex_fetch(__attributes_float, offset + k1);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*(f1 - f0);
+		if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
 		if(dy) *dy = 0.0f;
 #endif
 
-		return (1.0f - sd->u)*f0 + sd->u*f1;
+		return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1;
 	}
 	else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -71,22 +71,22 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + ccl_fetch(sd, prim)));
 	}
 	else if(elem == ATTR_ELEMENT_CURVE_KEY || elem == ATTR_ELEMENT_CURVE_KEY_MOTION) {
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 		int k1 = k0 + 1;
 
 		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + k0));
 		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + k1));
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*(f1 - f0);
+		if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
-		return (1.0f - sd->u)*f0 + sd->u*f1;
+		return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1;
 	}
 	else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -104,22 +104,22 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
 {
 	float r = 0.0f;
 
-	if(sd->type & PRIMITIVE_ALL_CURVE) {
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 		int k1 = k0 + 1;
 
 		float4 P_curve[2];
 
-		if(sd->type & PRIMITIVE_CURVE) {
+		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
 			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 		}
 		else {
-			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
+			motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve);
 		}
 
-		r = (P_curve[1].w - P_curve[0].w) * sd->u + P_curve[0].w;
+		r = (P_curve[1].w - P_curve[0].w) * ccl_fetch(sd, u) + P_curve[0].w;
 	}
 
 	return r*2.0f;
@@ -130,8 +130,8 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
 
 ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd)
 {
-	float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-	int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+	float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+	int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 	int k1 = k0 + 1;
 
 	float4 P_curve[2];
@@ -139,7 +139,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd
 	P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 	P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 
-	return float4_to_float3(P_curve[1]) * sd->u + float4_to_float3(P_curve[0]) * (1.0f - sd->u);
+	return float4_to_float3(P_curve[1]) * ccl_fetch(sd, u) + float4_to_float3(P_curve[0]) * (1.0f - ccl_fetch(sd, u));
 }
 
 /* Curve tangent normal */
@@ -148,14 +148,14 @@ ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
 {	
 	float3 tgN = make_float3(0.0f,0.0f,0.0f);
 
-	if(sd->type & PRIMITIVE_ALL_CURVE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 
-		tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu)));
+		tgN = -(-ccl_fetch(sd, I) - ccl_fetch(sd, dPdu) * (dot(ccl_fetch(sd, dPdu),-ccl_fetch(sd, I)) / len_squared(ccl_fetch(sd, dPdu))));
 		tgN = normalize(tgN);
 
 		/* need to find suitable scaled gd for corrected normal */
 #if 0
-		tgN = normalize(tgN - gd * sd->dPdu);
+		tgN = normalize(tgN - gd * ccl_fetch(sd, dPdu));
 #endif
 	}
 
@@ -442,12 +442,12 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 		float r_ext = mw_extension + r_curr;
 		float coverage = 1.0f;
 
-		if (bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
+		if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
 			/* the bounding box does not overlap the square centered at O */
 			tree += level;
 			level = tree & -tree;
 		}
-		else if (level == 1) {
+		else if(level == 1) {
 
 			/* the maximum recursion depth is reached.
 			* check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0.
@@ -459,13 +459,13 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 			if(flags & CURVE_KN_RIBBONS) {
 				float3 tg = (p_en - p_st);
 				float w = tg.x * tg.x + tg.y * tg.y;
-				if (w == 0) {
+				if(w == 0) {
 					tree++;
 					level = tree & -tree;
 					continue;
 				}
 				w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
-				w = clamp((float)w, 0.0f, 1.0f);
+				w = saturate(w);
 
 				/* compute u on the curve segment */
 				u = i_st * (1 - w) + i_en * w;
@@ -474,17 +474,17 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 				float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0];
 
 				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-				if (dot(tg, dp_st)< 0)
+				if(dot(tg, dp_st)< 0)
 					dp_st *= -1;
-				if (dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
+				if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
 					tree++;
 					level = tree & -tree;
 					continue;
 				}
 				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-				if (dot(tg, dp_en) < 0)
+				if(dot(tg, dp_en) < 0)
 					dp_en *= -1;
-				if (dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
+				if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
 					tree++;
 					level = tree & -tree;
 					continue;
@@ -500,13 +500,13 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 					float d0 = d - r_curr;
 					float d1 = d + r_curr;
 					float inv_mw_extension = 1.0f/mw_extension;
-					if (d0 >= 0)
+					if(d0 >= 0)
 						coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f;
 					else // inside
 						coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f;
 				}
 				
-				if (p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
+				if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
 					tree++;
 					level = tree & -tree;
 					continue;
@@ -548,7 +548,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 				float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1)));
 				float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd;
 				float td = tb*tb - 4*cyla*tc;
-				if (td < 0.0f) {
+				if(td < 0.0f) {
 					tree++;
 					level = tree & -tree;
 					continue;
@@ -559,10 +559,10 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 				t = tcentre + correction;
 
 				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-				if (dot(tg, dp_st)< 0)
+				if(dot(tg, dp_st)< 0)
 					dp_st *= -1;
 				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-				if (dot(tg, dp_en) < 0)
+				if(dot(tg, dp_en) < 0)
 					dp_en *= -1;
 
 				if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) {
@@ -570,14 +570,14 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 					t = tcentre + correction;
 				}			
 
-				if (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
+				if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
 					tree++;
 					level = tree & -tree;
 					continue;
 				}
 
 				float w = (zcentre + (tg.z * correction)) * invl;
-				w = clamp((float)w, 0.0f, 1.0f);
+				w = saturate(w);
 				/* compute u on the curve segment */
 				u = i_st * (1 - w) + i_en * w;
 
@@ -600,12 +600,12 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 #endif
 			{
 				/* record intersection */
+				isect->t = t;
+				isect->u = u;
+				isect->v = gd;
 				isect->prim = curveAddr;
 				isect->object = object;
 				isect->type = type;
-				isect->u = u;
-				isect->v = gd;
-				isect->t = t;
 				hit = true;
 			}
 			
@@ -646,8 +646,8 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
 	float4 P_curve[2];
 
 	if(type & PRIMITIVE_CURVE) {
-		P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
-		P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
+		P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
+		P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
 	}
 	else {
 		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
@@ -709,7 +709,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
 	const ssef sphere_dif1 = (dif + dif_second) * 0.5f;
 	const ssef dir = load4f(direction);
 	const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1);
-	const ssef sphere_dif2 = nmsub(sphere_b_tmp, dir, sphere_dif1);
+	const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1);
 #endif
 
 	float mr = max(r1, r2);
@@ -777,7 +777,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
 	float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma;
 	float td = tb*tb - 4*a*tc;
 
-	if (td < 0.0f)
+	if(td < 0.0f)
 		return false;
 
 	float rootd = 0.0f;
@@ -818,7 +818,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
 
 		if(t > 0.0f && t < isect->t && z >= 0 && z <= l) {
 
-			if (flags & CURVE_KN_ENCLOSEFILTER) {
+			if(flags & CURVE_KN_ENCLOSEFILTER) {
 				float enc_ratio = 1.01f;
 				if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) {
 					float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
@@ -835,12 +835,12 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
 #endif
 			{
 				/* record intersection */
+				isect->t = t;
+				isect->u = z*invl;
+				isect->v = gd;
 				isect->prim = curveAddr;
 				isect->object = object;
 				isect->type = type;
-				isect->u = z*invl;
-				isect->v = gd;
-				isect->t = t;
 
 				return true;
 			}
@@ -890,7 +890,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
+		Transform tfm = ccl_fetch(sd, ob_itfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
 #endif
@@ -903,7 +903,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 	int prim = kernel_tex_fetch(__prim_index, isect->prim);
 	float4 v00 = kernel_tex_fetch(__curves, prim);
 
-	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 	int k1 = k0 + 1;
 
 	float3 tg;
@@ -914,14 +914,14 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 
 		float4 P_curve[4];
 
-		if(sd->type & PRIMITIVE_CURVE) {
+		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
 			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
 			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
 			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
 			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
 		}
 		else {
-			motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
+			motion_cardinal_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), ka, k0, k1, kb, P_curve);
 		}
 
 		float3 p[4];
@@ -933,43 +933,43 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 		P = P + D*t;
 
 #ifdef __UV__
-		sd->u = isect->u;
-		sd->v = 0.0f;
+		ccl_fetch(sd, u) = isect->u;
+		ccl_fetch(sd, v) = 0.0f;
 #endif
 
 		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
 
 		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
-			sd->Ng = normalize(-(D - tg * (dot(tg, D))));
+			ccl_fetch(sd, Ng) = normalize(-(D - tg * (dot(tg, D))));
 		}
 		else {
 			/* direction from inside to surface of curve */
 			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);	
-			sd->Ng = normalize(P - p_curr);
+			ccl_fetch(sd, Ng) = normalize(P - p_curr);
 
 			/* adjustment for changing radius */
 			float gd = isect->v;
 
 			if(gd != 0.0f) {
-				sd->Ng = sd->Ng - gd * tg;
-				sd->Ng = normalize(sd->Ng);
+				ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg;
+				ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
 			}
 		}
 
 		/* todo: sometimes the normal is still so that this is detected as
 		 * backfacing even if cull backfaces is enabled */
 
-		sd->N = sd->Ng;
+		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
 	}
 	else {
 		float4 P_curve[2];
 
-		if(sd->type & PRIMITIVE_CURVE) {
+		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
 			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 		}
 		else {
-			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
+			motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve);
 		}
 
 		float l = 1.0f;
@@ -980,39 +980,39 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 		float3 dif = P - float4_to_float3(P_curve[0]);
 
 #ifdef __UV__
-		sd->u = dot(dif,tg)/l;
-		sd->v = 0.0f;
+		ccl_fetch(sd, u) = dot(dif,tg)/l;
+		ccl_fetch(sd, v) = 0.0f;
 #endif
 
-		if (flag & CURVE_KN_TRUETANGENTGNORMAL) {
-			sd->Ng = -(D - tg * dot(tg, D));
-			sd->Ng = normalize(sd->Ng);
+		if(flag & CURVE_KN_TRUETANGENTGNORMAL) {
+			ccl_fetch(sd, Ng) = -(D - tg * dot(tg, D));
+			ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
 		}
 		else {
 			float gd = isect->v;
 
 			/* direction from inside to surface of curve */
-			sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd);
+			ccl_fetch(sd, Ng) = (dif - tg * ccl_fetch(sd, u) * l) / (P_curve[0].w + ccl_fetch(sd, u) * l * gd);
 
 			/* adjustment for changing radius */
-			if (gd != 0.0f) {
-				sd->Ng = sd->Ng - gd * tg;
-				sd->Ng = normalize(sd->Ng);
+			if(gd != 0.0f) {
+				ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg;
+				ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
 			}
 		}
 
-		sd->N = sd->Ng;
+		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
 	}
 
 #ifdef __DPDU__
 	/* dPdu/dPdv */
-	sd->dPdu = tg;
-	sd->dPdv = cross(tg, sd->Ng);
+	ccl_fetch(sd, dPdu) = tg;
+	ccl_fetch(sd, dPdv) = cross(tg, ccl_fetch(sd, Ng));
 #endif
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
+		Transform tfm = ccl_fetch(sd, ob_tfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
 #endif
diff --git a/intern/cycles/kernel/geom/geom_motion_curve.h b/intern/cycles/kernel/geom/geom_motion_curve.h
index 1022a957b05..6de5aa7ea99 100644
--- a/intern/cycles/kernel/geom/geom_motion_curve.h
+++ b/intern/cycles/kernel/geom/geom_motion_curve.h
@@ -27,17 +27,22 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg, int object, uint id, AttributeElement *elem)
 {
-	/* todo: find a better (faster) solution for this, maybe store offset per object */
+	/* todo: find a better (faster) solution for this, maybe store offset per object.
+	 *
+	 * NOTE: currently it's not a bottleneck because in test scenes the loop below runs
+	 * zero iterations and rendering is really slow with motion curves. Until other
+	 * areas are sped up it's probably not so crucial to optimize this out.
+	 */
 	uint attr_offset = object*kernel_data.bvh.attributes_map_stride + ATTR_PRIM_CURVE;
 	uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
-	
+
 	while(attr_map.x != id) {
 		attr_offset += ATTR_PRIM_TYPES;
 		attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 	}
 
 	*elem = (AttributeElement)attr_map.y;
-	
+
 	/* return result */
 	return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z;
 }
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h
index b275b89a8a4..86f93f242a1 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -130,8 +130,11 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, ShaderData *s
 
 #ifdef __INTERSECTION_REFINE__
 	if(isect->object != OBJECT_NONE) {
+		if(UNLIKELY(t == 0.0f)) {
+			return P;
+		}
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
+		Transform tfm = ccl_fetch(sd, ob_itfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
 #endif
@@ -158,7 +161,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, ShaderData *s
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
+		Transform tfm = ccl_fetch(sd, ob_tfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
 #endif
@@ -184,7 +187,7 @@ ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, Sh
 #ifdef __INTERSECTION_REFINE__
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
+		Transform tfm = ccl_fetch(sd, ob_itfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
 #endif
@@ -210,7 +213,7 @@ ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, Sh
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
+		Transform tfm = ccl_fetch(sd, ob_tfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
 #endif
@@ -233,25 +236,25 @@ ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, Sh
 ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool subsurface)
 {
 	/* get shader */
-	sd->shader =  kernel_tex_fetch(__tri_shader, sd->prim);
+	ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
 
 	/* get motion info */
 	int numsteps, numverts;
-	object_motion_info(kg, sd->object, &numsteps, &numverts, NULL);
+	object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL);
 
 	/* figure out which steps we need to fetch and their interpolation factor */
 	int maxstep = numsteps*2;
-	int step = min((int)(sd->time*maxstep), maxstep-1);
-	float t = sd->time*maxstep - step;
+	int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1);
+	float t = ccl_fetch(sd, time)*maxstep - step;
 
 	/* find attribute */
 	AttributeElement elem;
-	int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_POSITION, &elem);
+	int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_POSITION, &elem);
 	kernel_assert(offset != ATTR_STD_NOT_FOUND);
 
 	/* fetch vertex coordinates */
 	float3 verts[3], next_verts[3];
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)));
 
 	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
 	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts);
@@ -265,33 +268,33 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD
 #ifdef __SUBSURFACE__
 	if(!subsurface)
 #endif
-		sd->P = motion_triangle_refine(kg, sd, isect, ray, verts);
+		ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts);
 #ifdef __SUBSURFACE__
 	else
-		sd->P = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts);
+		ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts);
 #endif
 
 	/* compute face normal */
 	float3 Ng;
-	if(sd->flag & SD_NEGATIVE_SCALE_APPLIED)
+	if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED)
 		Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0]));
 	else
 		Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
 
-	sd->Ng = Ng;
-	sd->N = Ng;
+	ccl_fetch(sd, Ng) = Ng;
+	ccl_fetch(sd, N) = Ng;
 
 	/* compute derivatives of P w.r.t. uv */
 #ifdef __DPDU__
-	sd->dPdu = (verts[0] - verts[2]);
-	sd->dPdv = (verts[1] - verts[2]);
+	ccl_fetch(sd, dPdu) = (verts[0] - verts[2]);
+	ccl_fetch(sd, dPdv) = (verts[1] - verts[2]);
 #endif
 
 	/* compute smooth normal */
-	if(sd->shader & SHADER_SMOOTH_NORMAL) {
+	if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
 		/* find attribute */
 		AttributeElement elem;
-		int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_NORMAL, &elem);
+		int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_NORMAL, &elem);
 		kernel_assert(offset != ATTR_STD_NOT_FOUND);
 
 		/* fetch vertex coordinates */
@@ -305,10 +308,10 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD
 		normals[2] = (1.0f - t)*normals[2] + t*next_normals[2];
 
 		/* interpolate between vertices */
-		float u = sd->u;
-		float v = sd->v;
+		float u = ccl_fetch(sd, u);
+		float v = ccl_fetch(sd, v);
 		float w = 1.0f - u - v;
-		sd->N = (u*normals[0] + v*normals[1] + w*normals[2]);
+		ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]);
 	}
 }
 
@@ -336,12 +339,12 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection
 		if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
 #endif
 		{
+			isect->t = t;
+			isect->u = u;
+			isect->v = v;
 			isect->prim = triAddr;
 			isect->object = object;
 			isect->type = PRIMITIVE_MOTION_TRIANGLE;
-			isect->u = u;
-			isect->v = v;
-			isect->t = t;
 		
 			return true;
 		}
@@ -388,12 +391,12 @@ ccl_device_inline void motion_triangle_intersect_subsurface(KernelGlobals *kg, I
 
 		/* record intersection */
 		Intersection *isect = &isect_array[hit];
+		isect->t = t;
+		isect->u = u;
+		isect->v = v;
 		isect->prim = triAddr;
 		isect->object = object;
 		isect->type = PRIMITIVE_MOTION_TRIANGLE;
-		isect->u = u;
-		isect->v = v;
-		isect->t = t;
 	}
 }
 #endif
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index 91edd5863ac..9d0a008fff1 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -123,9 +123,9 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg
 ccl_device_inline void object_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
-	*P = transform_point(&sd->ob_tfm, *P);
+	*P = transform_point_auto(&ccl_fetch(sd, ob_tfm), *P);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
 	*P = transform_point(&tfm, *P);
 #endif
 }
@@ -135,9 +135,9 @@ ccl_device_inline void object_position_transform(KernelGlobals *kg, const Shader
 ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
-	*P = transform_point(&sd->ob_itfm, *P);
+	*P = transform_point_auto(&ccl_fetch(sd, ob_itfm), *P);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
 	*P = transform_point(&tfm, *P);
 #endif
 }
@@ -147,9 +147,9 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, cons
 ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
-	*N = normalize(transform_direction_transposed(&sd->ob_tfm, *N));
+	*N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_tfm), *N));
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
 	*N = normalize(transform_direction_transposed(&tfm, *N));
 #endif
 }
@@ -159,9 +159,9 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const
 ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
-	*N = normalize(transform_direction_transposed(&sd->ob_itfm, *N));
+	*N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_itfm), *N));
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
 	*N = normalize(transform_direction_transposed(&tfm, *N));
 #endif
 }
@@ -171,9 +171,9 @@ ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderDa
 ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
-	*D = transform_direction(&sd->ob_tfm, *D);
+	*D = transform_direction_auto(&ccl_fetch(sd, ob_tfm), *D);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
 	*D = transform_direction(&tfm, *D);
 #endif
 }
@@ -183,9 +183,9 @@ ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData
 ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
-	*D = transform_direction(&sd->ob_itfm, *D);
+	*D = transform_direction_auto(&ccl_fetch(sd, ob_itfm), *D);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
 	*D = transform_direction(&tfm, *D);
 #endif
 }
@@ -194,13 +194,13 @@ ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const Sha
 
 ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd)
 {
-	if(sd->object == OBJECT_NONE)
+	if(ccl_fetch(sd, object) == OBJECT_NONE)
 		return make_float3(0.0f, 0.0f, 0.0f);
 
 #ifdef __OBJECT_MOTION__
-	return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w);
+	return make_float3(ccl_fetch(sd, ob_tfm).x.w, ccl_fetch(sd, ob_tfm).y.w, ccl_fetch(sd, ob_tfm).z.w);
 #else
-	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
 	return make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
 #endif
 }
@@ -243,7 +243,7 @@ ccl_device_inline float object_random_number(KernelGlobals *kg, int object)
 ccl_device_inline int object_particle_id(KernelGlobals *kg, int object)
 {
 	if(object == OBJECT_NONE)
-		return 0.0f;
+		return 0;
 
 	int offset = object*OBJECT_SIZE + OBJECT_PROPERTIES;
 	float4 f = kernel_tex_fetch(__objects, offset);
@@ -296,7 +296,7 @@ ccl_device_inline void object_motion_info(KernelGlobals *kg, int object, int *nu
 
 ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
 {
-	return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2 + 1);
+	return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2 + 1);
 }
 
 /* Particle data from which object was instanced */
@@ -377,7 +377,7 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir)
 
 /* Transform ray into object space to enter static object in BVH */
 
-ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
+ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
 {
 	Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
 
@@ -391,9 +391,41 @@ ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ra
 		*t *= len;
 }
 
+#ifdef __QBVH__
+/* Same as above, but optimized for QBVH scene intersection,
+ * which needs to modify two max distances.
+ *
+ * TODO(sergey): Investigate if passing NULL instead of t1 gets optimized
+ * so we can avoid having this duplication.
+ */
+ccl_device_inline void qbvh_instance_push(KernelGlobals *kg,
+                                          int object,
+                                          const Ray *ray,
+                                          float3 *P,
+                                          float3 *dir,
+                                          float3 *idir,
+                                          float *t,
+                                          float *t1)
+{
+	Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+
+	*P = transform_point(&tfm, ray->P);
+
+	float len;
+	*dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len));
+	*idir = bvh_inverse_direction(*dir);
+
+	if(*t != FLT_MAX)
+		*t *= len;
+
+	if(*t1 != -FLT_MAX)
+		*t1 *= len;
+}
+#endif
+
 /* Transorm ray to exit static object in BVH */
 
-ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
+ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
 {
 	if(*t != FLT_MAX) {
 		Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
@@ -421,7 +453,7 @@ ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg, int object, co
 #ifdef __OBJECT_MOTION__
 /* Transform ray into object space to enter motion blurred object in BVH */
 
-ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm)
+ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t, Transform *tfm)
 {
 	Transform itfm;
 	*tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm);
@@ -436,9 +468,36 @@ ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg, int object, c
 		*t *= len;
 }
 
+#ifdef __QBVH__
+/* Same as above, but optimized for QBVH scene intersection,
+ * which needs to modify two max distances.
+ *
+ * TODO(sergey): Investigate if passing NULL instead of t1 gets optimized
+ * so we can avoid having this duplication.
+ */
+ccl_device_inline void qbvh_instance_motion_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, float *t1, Transform *tfm)
+{
+	Transform itfm;
+	*tfm = object_fetch_transform_motion_test(kg, object, ray->time, &itfm);
+
+	*P = transform_point(&itfm, ray->P);
+
+	float len;
+	*dir = bvh_clamp_direction(normalize_len(transform_direction(&itfm, ray->D), &len));
+	*idir = bvh_inverse_direction(*dir);
+
+
+	if(*t != FLT_MAX)
+		*t *= len;
+
+	if(*t1 != -FLT_MAX)
+		*t1 *= len;
+}
+#endif
+
 /* Transorm ray to exit motion blurred object in BVH */
 
-ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t, Transform *tfm)
+ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t, Transform *tfm)
 {
 	if(*t != FLT_MAX)
 		*t *= len(transform_direction(tfm, 1.0f/(*idir)));
@@ -461,5 +520,38 @@ ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg, int obj
 
 #endif
 
+/* Variants taking ccl_addr_space pointers: copy to a local, transform, copy back.
+ * TODO(sergey): This is only for until we've got OpenCL 2.0 on all devices we
+ * consider supported. It'll be replaced with generic address space.
+ */
+
+#ifdef __KERNEL_OPENCL__
+ccl_device_inline void object_dir_transform_addrspace(KernelGlobals *kg,
+                                                      const ShaderData *sd,
+                                                      ccl_addr_space float3 *D)
+{
+	float3 private_D = *D;  /* local copy, so a plain float3* can be passed on */
+	object_dir_transform(kg, sd, &private_D);
+	*D = private_D;  /* write the transformed direction back to the caller */
+}
+
+ccl_device_inline void object_normal_transform_addrspace(KernelGlobals *kg,
+                                                         const ShaderData *sd,
+                                                         ccl_addr_space float3 *N)
+{
+	float3 private_N = *N;  /* local copy, so a plain float3* can be passed on */
+	object_normal_transform(kg, sd, &private_N);
+	*N = private_N;  /* write the transformed normal back to the caller */
+}
+#endif  /* __KERNEL_OPENCL__ */
+
+#ifndef __KERNEL_OPENCL__  /* *_auto: plain functions here, *_addrspace wrappers under OpenCL */
+#  define object_dir_transform_auto object_dir_transform
+#  define object_normal_transform_auto object_normal_transform
+#else  /* __KERNEL_OPENCL__ */
+#  define object_dir_transform_auto object_dir_transform_addrspace
+#  define object_normal_transform_auto object_normal_transform_addrspace
+#endif  /* __KERNEL_OPENCL__ */
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index 5df6c75df86..30f12d32355 100644
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* Primitive Utilities
@@ -25,16 +25,16 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
 {
-	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
 		return triangle_attribute_float(kg, sd, elem, offset, dx, dy);
 	}
 #ifdef __HAIR__
-	else if(sd->type & PRIMITIVE_ALL_CURVE) {
+	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 		return curve_attribute_float(kg, sd, elem, offset, dx, dy);
 	}
 #endif
 #ifdef __VOLUME__
-	else if(sd->object != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
+	else if(ccl_fetch(sd, object) != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
 		return volume_attribute_float(kg, sd, elem, offset, dx, dy);
 	}
 #endif
@@ -47,16 +47,16 @@ ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *
 
 ccl_device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
 {
-	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
 		return triangle_attribute_float3(kg, sd, elem, offset, dx, dy);
 	}
 #ifdef __HAIR__
-	else if(sd->type & PRIMITIVE_ALL_CURVE) {
+	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 		return curve_attribute_float3(kg, sd, elem, offset, dx, dy);
 	}
 #endif
 #ifdef __VOLUME__
-	else if(sd->object != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
+	else if(ccl_fetch(sd, object) != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
 		return volume_attribute_float3(kg, sd, elem, offset, dx, dy);
 	}
 #endif
@@ -108,9 +108,9 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in
 ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __HAIR__
-	if(sd->type & PRIMITIVE_ALL_CURVE)
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)
 #ifdef __DPDU__
-		return normalize(sd->dPdu);
+		return normalize(ccl_fetch(sd, dPdu));
 #else
 		return make_float3(0.0f, 0.0f, 0.0f);
 #endif
@@ -124,12 +124,12 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 		float3 data = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL);
 		data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f);
 		object_normal_transform(kg, sd, &data);
-		return cross(sd->N, normalize(cross(data, sd->N)));
+		return cross(ccl_fetch(sd, N), normalize(cross(data, ccl_fetch(sd, N))));
 	}
 	else {
 		/* otherwise use surface derivatives */
 #ifdef __DPDU__
-		return normalize(sd->dPdu);
+		return normalize(ccl_fetch(sd, dPdu));
 #else
 		return make_float3(0.0f, 0.0f, 0.0f);
 #endif
@@ -144,15 +144,16 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 	float3 center;
 
 #ifdef __HAIR__
-	if(sd->type & PRIMITIVE_ALL_CURVE) {
+	bool is_curve_primitive = ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE;
+	if(is_curve_primitive) {
 		center = curve_motion_center_location(kg, sd);
 
-		if(!(sd->flag & SD_TRANSFORM_APPLIED))
+		if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED))
 			object_position_transform(kg, sd, &center);
 	}
 	else
 #endif
-		center = sd->P;
+		center = ccl_fetch(sd, P);
 
 	float3 motion_pre = center, motion_post = center;
 
@@ -163,30 +164,37 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 	if(offset != ATTR_STD_NOT_FOUND) {
 		/* get motion info */
 		int numverts, numkeys;
-		object_motion_info(kg, sd->object, NULL, &numverts, &numkeys);
+		object_motion_info(kg, ccl_fetch(sd, object), NULL, &numverts, &numkeys);
 
 		/* lookup attributes */
-		int offset_next = (sd->type & PRIMITIVE_ALL_TRIANGLE)? offset + numverts: offset + numkeys;
+		int offset_next = (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? offset + numverts: offset + numkeys;
 
 		motion_pre = primitive_attribute_float3(kg, sd, elem, offset, NULL, NULL);
 		motion_post = primitive_attribute_float3(kg, sd, elem, offset_next, NULL, NULL);
+
+#ifdef __HAIR__
+		if(is_curve_primitive && (ccl_fetch(sd, flag) & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
+			object_position_transform(kg, sd, &motion_pre);
+			object_position_transform(kg, sd, &motion_post);
+		}
+#endif
 	}
 
 	/* object motion. note that depending on the mesh having motion vectors, this
 	 * transformation was set match the world/object space of motion_pre/post */
 	Transform tfm;
 	
-	tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_PRE);
+	tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_PRE);
 	motion_pre = transform_point(&tfm, motion_pre);
 
-	tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_POST);
+	tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_POST);
 	motion_post = transform_point(&tfm, motion_post);
 
 	float3 motion_center;
 
 	/* camera motion, for perspective/orthographic motion.pre/post will be a
 	 * world-to-raster matrix, for panorama it's world-to-camera */
-	if (kernel_data.cam.type != CAMERA_PANORAMA) {
+	if(kernel_data.cam.type != CAMERA_PANORAMA) {
 		tfm = kernel_data.cam.worldtoraster;
 		motion_center = transform_perspective(&tfm, center);
 
diff --git a/intern/cycles/kernel/geom/geom_qbvh.h b/intern/cycles/kernel/geom/geom_qbvh.h
new file mode 100644
index 00000000000..37deaac0800
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_qbvh.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+struct QBVHStackItem {  /* One entry of the QBVH traversal stack. */
+	int addr;  /* node address to visit, or ENTRYPOINT_SENTINEL */
+	float dist;  /* distance the stack sort orders by (tNear from the node test) */
+};
+
+/* TODO(sergey): Investigate if using intrinsics helps for both
+ * stack item swap and float comparison.
+ */
+ccl_device_inline void qbvh_item_swap(QBVHStackItem *__restrict a,
+                                      QBVHStackItem *__restrict b)
+{
+	QBVHStackItem tmp = *a;  /* exchange *a and *b through a temporary copy */
+	*a = *b;
+	*b = tmp;
+}
+
+ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
+                                       QBVHStackItem *__restrict s2,
+                                       QBVHStackItem *__restrict s3)
+{  /* Three compare-and-swap steps leave s1->dist <= s2->dist <= s3->dist. */
+	if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); }
+	if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); }  /* s3 now holds the largest dist */
+	if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); }
+}
+
+ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
+                                       QBVHStackItem *__restrict s2,
+                                       QBVHStackItem *__restrict s3,
+                                       QBVHStackItem *__restrict s4)
+{  /* Five compare-and-swap steps leave s1 <= s2 <= s3 <= s4 by dist. */
+	if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); }
+	if(s4->dist < s3->dist) { qbvh_item_swap(s4, s3); }
+	if(s3->dist < s1->dist) { qbvh_item_swap(s3, s1); }  /* s1 now holds the smallest dist */
+	if(s4->dist < s2->dist) { qbvh_item_swap(s4, s2); }  /* s4 now holds the largest dist */
+	if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); }
+}
+
+ccl_device_inline int qbvh_node_intersect(KernelGlobals *__restrict kg,
+                                          const ssef& tnear,
+                                          const ssef& tfar,
+#ifdef __KERNEL_AVX2__
+                                          const sse3f& org_idir,
+#else
+                                          const sse3f& org,
+#endif
+                                          const sse3f& idir,
+                                          const int near_x,
+                                          const int near_y,
+                                          const int near_z,
+                                          const int far_x,
+                                          const int far_y,
+                                          const int far_z,
+                                          const int nodeAddr,
+                                          ssef *__restrict dist)
+{  /* Tests the ray against node nodeAddr; returns a hit bit mask and writes tNear to *dist. */
+	const int offset = nodeAddr*BVH_QNODE_SIZE;  /* index of this node's first __bvh_nodes entry */
+#ifdef __KERNEL_AVX2__  /* NOTE(review): msub(a, b, c) presumably computes a*b - c -- confirm */
+	const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, org_idir.x);
+	const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, org_idir.y);
+	const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, org_idir.z);
+	const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, org_idir.x);
+	const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, org_idir.y);
+	const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, org_idir.z);
+#else
+	const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - org.x) * idir.x;
+	const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - org.y) * idir.y;
+	const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - org.z) * idir.z;
+	const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - org.x) * idir.x;
+	const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - org.y) * idir.y;
+	const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - org.z) * idir.z;
+#endif  /* __KERNEL_AVX2__ */
+
+#ifdef __KERNEL_SSE41__  /* NOTE(review): maxi/mini/cast look like integer ops on the float bits -- confirm */
+	const ssef tNear = maxi(maxi(tnear_x, tnear_y), maxi(tnear_z, tnear));
+	const ssef tFar = mini(mini(tfar_x, tfar_y), mini(tfar_z, tfar));
+	const sseb vmask = cast(tNear) > cast(tFar);
+	int mask = (int)movemask(vmask)^0xf;  /* vmask flags the rejected lanes, so flip the low four bits */
+#else
+	const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
+	const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
+	const sseb vmask = tNear <= tFar;
+	int mask = (int)movemask(vmask);
+#endif  /* __KERNEL_SSE41__ */
+	*dist = tNear;  /* callers index this per lane to get each child's distance */
+	return mask;  /* set bits mark the lanes that passed the tNear/tFar test */
+}
+
+ccl_device_inline int qbvh_node_intersect_robust(KernelGlobals *__restrict kg,
+                                                 const ssef& tnear,
+                                                 const ssef& tfar,
+#ifdef __KERNEL_AVX2__
+                                                 const sse3f& P_idir,
+#else
+                                                 const sse3f& P,
+#endif
+                                                 const sse3f& idir,
+                                                 const int near_x,
+                                                 const int near_y,
+                                                 const int near_z,
+                                                 const int far_x,
+                                                 const int far_y,
+                                                 const int far_z,
+                                                 const int nodeAddr,
+                                                 const float difl,
+                                                 ssef *__restrict dist)
+{  /* Like qbvh_node_intersect(), but with the comparison loosened by difl. */
+	const int offset = nodeAddr*BVH_QNODE_SIZE;  /* index of this node's first __bvh_nodes entry */
+#ifdef __KERNEL_AVX2__  /* NOTE(review): msub(a, b, c) presumably computes a*b - c -- confirm */
+	const ssef tnear_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x), idir.x, P_idir.x);
+	const ssef tnear_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y), idir.y, P_idir.y);
+	const ssef tnear_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z), idir.z, P_idir.z);
+	const ssef tfar_x = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x), idir.x, P_idir.x);
+	const ssef tfar_y = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y), idir.y, P_idir.y);
+	const ssef tfar_z = msub(kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z), idir.z, P_idir.z);
+#else
+	const ssef tnear_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_x) - P.x) * idir.x;
+	const ssef tnear_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_y) - P.y) * idir.y;
+	const ssef tnear_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+near_z) - P.z) * idir.z;
+	const ssef tfar_x = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_x) - P.x) * idir.x;
+	const ssef tfar_y = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_y) - P.y) * idir.y;
+	const ssef tfar_z = (kernel_tex_fetch_ssef(__bvh_nodes, offset+far_z) - P.z) * idir.z;
+#endif  /* __KERNEL_AVX2__ */
+
+	const float round_down = 1.0f - difl;  /* factor applied to tNear in the comparison */
+	const float round_up = 1.0f + difl;  /* factor applied to tFar in the comparison */
+	const ssef tNear = max4(tnear_x, tnear_y, tnear_z, tnear);
+	const ssef tFar = min4(tfar_x, tfar_y, tfar_z, tfar);
+	const sseb vmask = round_down*tNear <= round_up*tFar;
+	*dist = tNear;  /* the unscaled tNear is what gets reported */
+	return (int)movemask(vmask);  /* set bits mark the lanes that passed the test */
+}
diff --git a/intern/cycles/kernel/geom/geom_qbvh_shadow.h b/intern/cycles/kernel/geom/geom_qbvh_shadow.h
new file mode 100644
index 00000000000..dc37e6ecfa4
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_qbvh_shadow.h
@@ -0,0 +1,403 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for shadow rays, where various
+ * features can be enabled/disabled. This way we can compile optimized versions
+ * for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect_array,
+                                             const uint max_hits,
+                                             uint *num_hits)
+{
+	/* TODO(sergey):
+	 * - Likely and unlikely for if() statements.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack, local to this call; slot 0 holds the sentinel. */
+	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
+	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
+
+	/* Traversal variables in registers. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* Ray parameters in registers. */
+	const float tmax = ray->t;
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = tmax;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif  /* BVH_FEATURE(BVH_MOTION) */
+
+	*num_hits = 0;
+	isect_array->t = tmax;
+
+#ifndef __KERNEL_SSE41__
+	if(!isfinite(P.x)) {
+		return false;
+	}
+#endif  /* !__KERNEL_SSE41__ */
+
+#if BVH_FEATURE(BVH_INSTANCING)
+	int num_hits_in_instance = 0;
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+
+	ssef tnear(0.0f), tfar(tmax);
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif  /* __KERNEL_AVX2__ */
+
+	/* Offsets to select the side that becomes the lower or upper bound. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop. */
+	do {
+		do {
+			/* Traverse internal nodes. */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				ssef dist;
+				int traverseChild = qbvh_node_intersect(kg,
+				                                        tnear,
+				                                        tfar,
+#ifdef __KERNEL_AVX2__
+				                                        P_idir4,
+#else
+				                                        org,
+#endif
+				                                        idir4,
+				                                        near_x, near_y, near_z,
+				                                        far_x, far_y, far_z,
+				                                        nodeAddr,
+				                                        &dist);
+
+				if(traverseChild != 0) {
+					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+
+					/* One child is hit, continue with that child. */
+					int r = __bscf(traverseChild);  /* NOTE(review): __bscf presumably returns the lowest set bit index and clears it in traverseChild -- confirm */
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					float d0 = ((float*)&dist)[r];
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c0;
+							traversalStack[stackPtr].dist = d0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c1;
+							traversalStack[stackPtr].dist = d1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c1;
+					traversalStack[stackPtr].dist = d1;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c0;
+					traversalStack[stackPtr].dist = d0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = c2;
+						traversalStack[stackPtr].dist = d2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2]);
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c3;
+					traversalStack[stackPtr].dist = d3;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c2;
+					traversalStack[stackPtr].dist = d2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3]);
+				}
+
+				nodeAddr = traversalStack[stackPtr].addr;  /* pop: reached when no child was hit, or after the four-child sort */
+				--stackPtr;
+			}
+
+			/* If node is leaf, fetch triangle list. */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE);
+#ifdef __VISIBILITY_FLAG__
+				if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) {
+					/* Pop. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					--stackPtr;
+					continue;
+				}
+#endif  /* __VISIBILITY_FLAG__ */
+
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+					const uint p_type = type & PRIMITIVE_ALL;
+
+					/* Pop. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					--stackPtr;
+
+					/* Primitive intersection. */
+					while(primAddr < primAddr2) {
+						kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+
+						bool hit;
+
+						/* todo: specialized intersect functions which don't fill in
+						 * isect unless needed and check SD_HAS_TRANSPARENT_SHADOW?
+						 * might give a few % performance improvement */
+
+						switch(p_type) {
+							case PRIMITIVE_TRIANGLE: {
+								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, PATH_RAY_SHADOW, object, primAddr);
+								break;
+							}
+#if BVH_FEATURE(BVH_MOTION)
+							case PRIMITIVE_MOTION_TRIANGLE: {
+								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, PATH_RAY_SHADOW, object, primAddr);
+								break;
+							}
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+							case PRIMITIVE_CURVE:
+							case PRIMITIVE_MOTION_CURVE: {
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
+									hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
+								else
+									hit = bvh_curve_intersect(kg, isect_array, P, dir, PATH_RAY_SHADOW, object, primAddr, ray->time, type, NULL, 0, 0);
+								break;
+							}
+#endif
+							default: {
+								hit = false;
+								break;
+							}
+						}
+
+						/* Shadow ray early termination. */
+						if(hit) {
+							/* detect if this surface has a shader with transparent shadows */
+
+							/* todo: optimize so primitive visibility flag indicates if
+							 * the primitive has a transparent shadow shader? */
+							int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
+							int shader = 0;
+
+#ifdef __HAIR__
+							if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
+#endif
+							{
+								shader = kernel_tex_fetch(__tri_shader, prim);
+							}
+#ifdef __HAIR__
+							else {
+								float4 str = kernel_tex_fetch(__curves, prim);
+								shader = __float_as_int(str.z);
+							}
+#endif
+							int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*2);
+
+							/* if no transparent shadows, all light is blocked */
+							if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+								return true;
+							}
+							/* if maximum number of hits reached, block all light */
+							else if(*num_hits == max_hits) {
+								return true;
+							}
+
+							/* move on to next entry in intersections array */
+							isect_array++;
+							(*num_hits)++;
+#if BVH_FEATURE(BVH_INSTANCING)
+							num_hits_in_instance++;
+#endif
+
+							isect_array->t = isect_t;  /* start the next entry at the current distance limit */
+						}
+
+						primAddr++;
+					}
+				}  /* closes if(primAddr >= 0) with instancing, otherwise if(nodeAddr < 0) */
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* Instance push. */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+
+#if BVH_FEATURE(BVH_MOTION)
+					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
+#else
+					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+#endif
+
+					num_hits_in_instance = 0;
+					isect_array->t = isect_t;
+
+					if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+					if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+					if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+					tfar = ssef(isect_t);
+					idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+					P_idir = P*idir;
+					P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+					org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+					triangle_intersect_precalc(dir, &isect_precalc);
+
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
+
+					nodeAddr = kernel_tex_fetch(__object_node, object);
+
+				}
+			}  /* closes if(nodeAddr < 0); only compiled with instancing */
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			if(num_hits_in_instance) {
+				float t_fac;
+
+#if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_tfm);
+#else
+				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+#endif
+
+				/* scale isect->t to adjust for instancing */
+				for(int i = 0; i < num_hits_in_instance; i++)
+					(isect_array-i-1)->t *= t_fac;
+			}
+			else {
+				float ignore_t = FLT_MAX;
+
+#if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_tfm);
+#else
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+#endif
+			}
+
+			isect_t = tmax;
+			isect_array->t = isect_t;
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			tfar = ssef(tmax);
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+			org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr].addr;
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return false;
+}
diff --git a/intern/cycles/kernel/geom/geom_qbvh_subsurface.h b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h
new file mode 100644
index 00000000000..d85e1a4691e
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_qbvh_subsurface.h
@@ -0,0 +1,326 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for subsurface scattering, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect_array,
+                                             int subsurface_object,
+                                             uint *lcg_state,
+                                             int max_hits)
+{
+	/* TODO(sergey):
+	 * - Test if pushing distance on the stack helps (for non shadow rays).
+	 * - Separate version for shadow rays.
+	 * - Likely and unlikely for if() statements.
+	 * - SSE for hair.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack; slot 0 holds the sentinel that ends the traversal. */
+	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
+	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
+
+	/* Traversal state: index of the stack top and the node being visited. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* Ray state (re-assigned by instance push/pop), current instance object, distance limit and hit counter. */
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = ray->t;
+	uint num_hits = 0;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+#ifndef __KERNEL_SSE41__
+	if(!isfinite(P.x)) {
+		return 0;
+	}
+#endif
+
+	ssef tnear(0.0f), tfar(isect_t);
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+	/* Offsets to select the side that becomes the lower or upper bound, chosen from the sign of idir. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop: the inner loop runs until a sentinel is popped, the outer one then leaves the instance (if any). */
+	do {
+		do {
+			/* Traverse internal nodes (non-negative addresses other than the sentinel). */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				ssef dist;
+				int traverseChild = qbvh_node_intersect(kg,
+				                                        tnear,
+				                                        tfar,
+#ifdef __KERNEL_AVX2__
+				                                        P_idir4,
+#else
+				                                        org,
+#endif
+				                                        idir4,
+				                                        near_x, near_y, near_z,
+				                                        far_x, far_y, far_z,
+				                                        nodeAddr,
+				                                        &dist);
+
+				if(traverseChild != 0) {
+					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+
+					/* One child is hit, continue with that child (__bscf() presumably extracts and clears one set bit of the mask - confirm). */
+					int r = __bscf(traverseChild);
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					float d0 = ((float*)&dist)[r];
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c0;
+							traversalStack[stackPtr].dist = d0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c1;
+							traversalStack[stackPtr].dist = d1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c1;
+					traversalStack[stackPtr].dist = d1;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c0;
+					traversalStack[stackPtr].dist = d0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = c2;
+						traversalStack[stackPtr].dist = d2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2]);
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c3;
+					traversalStack[stackPtr].dist = d3;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c2;
+					traversalStack[stackPtr].dist = d2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3]);
+				}
+
+				nodeAddr = traversalStack[stackPtr].addr;
+				--stackPtr;
+			}
+
+			/* If node is leaf, fetch its record: x/y hold the primitive range (negative x marks an instance), w the primitive type. */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE);
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+
+					/* Pop the next node now; the primitives of this leaf are tested below. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					--stackPtr;
+
+					/* Primitive intersection. */
+					switch(type & PRIMITIVE_ALL) {
+						case PRIMITIVE_TRIANGLE: {
+							/* Intersect ray against primitive. */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Only primitives from the same object. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								if(tri_object != subsurface_object) {
+									continue;
+								}
+								triangle_intersect_subsurface(kg, &isect_precalc, isect_array, P, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							/* Intersect ray against primitive. */
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Only primitives from the same object. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								if(tri_object != subsurface_object) {
+									continue;
+								}
+								motion_triangle_intersect_subsurface(kg, isect_array, P, dir, ray->time, object, primAddr, isect_t, &num_hits, lcg_state, max_hits);
+							}
+							break;
+						}
+#endif
+						default:
+							break;
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* Instance push, only when the instanced object is subsurface_object. */
+					if(subsurface_object == kernel_tex_fetch(__prim_object, -primAddr-1)) {
+						object = subsurface_object;
+
+#if BVH_FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
+#else
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+#endif
+
+						if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+						if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+						if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+						tfar = ssef(isect_t);
+						idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+						P_idir = P*idir;
+						P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+						org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+						triangle_intersect_precalc(dir, &isect_precalc);
+
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* Instance of another object: skip it and pop. */
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+					}
+
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			/* Instance pop: the sentinel pushed by the instance push was reached. */
+#if BVH_FEATURE(BVH_MOTION)
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
+#else
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect_t);
+#endif
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			tfar = ssef(isect_t);
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+			org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr].addr;
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return num_hits;
+}
diff --git a/intern/cycles/kernel/geom/geom_qbvh_traversal.h b/intern/cycles/kernel/geom/geom_qbvh_traversal.h
new file mode 100644
index 00000000000..7e356ea062b
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_qbvh_traversal.h
@@ -0,0 +1,425 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function, where various features can be
+ * enabled/disabled. This way we can compile optimized versions for each case
+ * without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_HAIR_MINIMUM_WIDTH: hair curve rendering with minimum width
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect,
+                                             const uint visibility
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+                                             ,uint *lcg_state,
+                                             float difl,
+                                             float extmax
+#endif
+                                             )
+{
+	/* TODO(sergey):
+	 * - Test if pushing distance on the stack helps (for non shadow rays).
+	 * - Separate version for shadow rays.
+	 * - Likely and unlikely for if() statements.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack; slot 0 holds the sentinel that ends the traversal. */
+	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
+	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
+	traversalStack[0].dist = -FLT_MAX;
+
+	/* Traversal state: index of the stack top, the node being visited and its distance. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+	float nodeDist = -FLT_MAX;
+
+	/* Ray state (re-assigned by instance push/pop) and current instance object. */
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+#ifndef __KERNEL_SSE41__
+	if(!isfinite(P.x)) {
+		return false;
+	}
+#endif
+
+	isect->t = ray->t;
+	isect->u = 0.0f;
+	isect->v = 0.0f;
+	isect->prim = PRIM_NONE;
+	isect->object = OBJECT_NONE;
+
+#if defined(__KERNEL_DEBUG__)
+	isect->num_traversal_steps = 0;
+	isect->num_traversed_instances = 0;
+#endif
+
+	ssef tnear(0.0f), tfar(ray->t);
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+	/* Offsets to select the side that becomes the lower or upper bound, chosen from the sign of idir. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop: the inner loop runs until a sentinel is popped, the outer one then leaves the instance (if any). */
+	do {
+		do {
+			/* Traverse internal nodes (non-negative addresses other than the sentinel). */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				if(UNLIKELY(nodeDist > isect->t)) {
+					/* Node distance is beyond the current isect->t: skip it and pop the next one. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					nodeDist = traversalStack[stackPtr].dist;
+					--stackPtr;
+					continue;
+				}
+
+				int traverseChild;
+				ssef dist;
+
+#if defined(__KERNEL_DEBUG__)
+				isect->num_traversal_steps++;
+#endif
+
+#if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
+				if(difl != 0.0f) {
+					/* NOTE: We extend all the child BB instead of fetching
+					 * and checking visibility flags for each of them.
+					 *
+					 * Need to test if doing opposite would be any faster.
+					 */
+					traverseChild = qbvh_node_intersect_robust(kg,
+					                                           tnear,
+					                                           tfar,
+#ifdef __KERNEL_AVX2__
+					                                           P_idir4,
+#else
+					                                           org,
+#endif
+					                                           idir4,
+					                                           near_x, near_y, near_z,
+					                                           far_x, far_y, far_z,
+					                                           nodeAddr,
+					                                           difl,
+					                                           &dist);
+				}
+				else
+#endif
+				{
+					traverseChild = qbvh_node_intersect(kg,
+					                                    tnear,
+					                                    tfar,
+#ifdef __KERNEL_AVX2__
+					                                    P_idir4,
+#else
+					                                    org,
+#endif
+					                                    idir4,
+					                                    near_x, near_y, near_z,
+					                                    far_x, far_y, far_z,
+					                                    nodeAddr,
+					                                    &dist);
+				}
+
+				if(traverseChild != 0) {
+					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+
+					/* One child is hit, continue with that child (__bscf() presumably extracts and clears one set bit of the mask - confirm). */
+					int r = __bscf(traverseChild);
+					float d0 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						nodeDist = d0;
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							nodeDist = d1;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c0;
+							traversalStack[stackPtr].dist = d0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							nodeDist = d0;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c1;
+							traversalStack[stackPtr].dist = d1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c1;
+					traversalStack[stackPtr].dist = d1;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c0;
+					traversalStack[stackPtr].dist = d0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = c2;
+						traversalStack[stackPtr].dist = d2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2]);
+						nodeAddr = traversalStack[stackPtr].addr;
+						nodeDist = traversalStack[stackPtr].dist;
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c3;
+					traversalStack[stackPtr].dist = d3;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c2;
+					traversalStack[stackPtr].dist = d2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3]);
+				}
+
+				nodeAddr = traversalStack[stackPtr].addr;
+				nodeDist = traversalStack[stackPtr].dist;
+				--stackPtr;
+			}
+
+			/* If node is leaf, fetch its record (x/y: primitive range, negative x marks an instance; z: visibility bits; w: type). */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE);
+
+#ifdef __VISIBILITY_FLAG__
+				if(UNLIKELY((nodeDist > isect->t) || ((__float_as_uint(leaf.z) & visibility) == 0)))
+#else
+				if(UNLIKELY((nodeDist > isect->t)))
+#endif
+				{
+					/* Leaf is beyond isect->t (or masked out by visibility when __VISIBILITY_FLAG__ is set): skip it and pop. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					nodeDist = traversalStack[stackPtr].dist;
+					--stackPtr;
+					continue;
+				}
+
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+
+					/* Pop the next node now; the primitives of this leaf are tested below. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					nodeDist = traversalStack[stackPtr].dist;
+					--stackPtr;
+
+					/* Primitive intersection. */
+					switch(type & PRIMITIVE_ALL) {
+						case PRIMITIVE_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+#if defined(__KERNEL_DEBUG__)
+								isect->num_traversal_steps++;
+#endif
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) {
+									tfar = ssef(isect->t);
+									/* Shadow ray early termination. */
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+								}
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+#if defined(__KERNEL_DEBUG__)
+								isect->num_traversal_steps++;
+#endif
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								if(motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr)) {
+									tfar = ssef(isect->t);
+									/* Shadow ray early termination. */
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+								}
+							}
+							break;
+						}
+#endif  /* BVH_FEATURE(BVH_MOTION) */
+#if BVH_FEATURE(BVH_HAIR)
+						case PRIMITIVE_CURVE:
+						case PRIMITIVE_MOTION_CURVE: {
+							for(; primAddr < primAddr2; primAddr++) {
+#if defined(__KERNEL_DEBUG__)
+								isect->num_traversal_steps++;
+#endif
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								bool hit;
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
+									hit = bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
+								else
+									hit = bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, lcg_state, difl, extmax);
+								if(hit) {
+									tfar = ssef(isect->t);
+									/* Shadow ray early termination. */
+									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+										return true;
+								}
+							}
+							break;
+						}
+#endif  /* BVH_FEATURE(BVH_HAIR) */
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* Instance push: the object index is stored at -primAddr-1 in __prim_object. */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+
+#if BVH_FEATURE(BVH_MOTION)
+					qbvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &nodeDist, &ob_tfm);
+#else
+					qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &nodeDist);
+#endif
+
+					if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+					if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+					if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+					tfar = ssef(isect->t);
+					idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+					P_idir = P*idir;
+					P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+					org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+					triangle_intersect_precalc(dir, &isect_precalc);
+
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
+					traversalStack[stackPtr].dist = -FLT_MAX;
+
+					nodeAddr = kernel_tex_fetch(__object_node, object);
+
+#if defined(__KERNEL_DEBUG__)
+					isect->num_traversed_instances++;
+#endif
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			/* Instance pop: the sentinel pushed by the instance push was reached. */
+#if BVH_FEATURE(BVH_MOTION)
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
+#else
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+#endif
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			tfar = ssef(isect->t);
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+			org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr].addr;
+			nodeDist = traversalStack[stackPtr].dist;
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return (isect->prim != PRIM_NONE);
+}
diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume.h b/intern/cycles/kernel/geom/geom_qbvh_volume.h
new file mode 100644
index 00000000000..d8cfa3a4061
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_qbvh_volume.h
@@ -0,0 +1,351 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ */
+
+ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect)
+{
+	/* TODO(sergey):
+	 * - Test if pushing distance on the stack helps.
+	 * - Likely and unlikely for if() statements.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack in CUDA thread-local memory. */
+	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
+	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
+
+	/* Traversal variables in registers. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* Ray parameters in registers. */
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+
+	const uint visibility = PATH_RAY_ALL_VISIBILITY;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+#ifndef __KERNEL_SSE41__
+	if(!isfinite(P.x)) {
+		return false;
+	}
+#endif
+
+	isect->t = ray->t;
+	isect->u = 0.0f;
+	isect->v = 0.0f;
+	isect->prim = PRIM_NONE;
+	isect->object = OBJECT_NONE;
+
+	ssef tnear(0.0f), tfar(ray->t);
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+	/* Offsets to select the side that becomes the lower or upper bound. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop. */
+	do {
+		do {
+			/* Traverse internal nodes. */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				ssef dist;
+				int traverseChild = qbvh_node_intersect(kg,
+				                                        tnear,
+				                                        tfar,
+#ifdef __KERNEL_AVX2__
+				                                        P_idir4,
+#else
+				                                        org,
+#endif
+				                                        idir4,
+				                                        near_x, near_y, near_z,
+				                                        far_x, far_y, far_z,
+				                                        nodeAddr,
+				                                        &dist);
+
+				if(traverseChild != 0) {
+					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+
+					/* One child is hit, continue with that child. */
+					int r = __bscf(traverseChild);
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					float d0 = ((float*)&dist)[r];
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c0;
+							traversalStack[stackPtr].dist = d0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c1;
+							traversalStack[stackPtr].dist = d1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c1;
+					traversalStack[stackPtr].dist = d1;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c0;
+					traversalStack[stackPtr].dist = d0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = c2;
+						traversalStack[stackPtr].dist = d2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2]);
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c3;
+					traversalStack[stackPtr].dist = d3;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c2;
+					traversalStack[stackPtr].dist = d2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3]);
+				}
+
+				nodeAddr = traversalStack[stackPtr].addr;
+				--stackPtr;
+			}
+
+			/* If node is leaf, fetch triangle list. */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE);
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+					const uint p_type = type & PRIMITIVE_ALL;
+
+					/* Pop. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					--stackPtr;
+
+					/* Primitive intersection. */
+					switch(p_type) {
+						case PRIMITIVE_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Only primitives from volume object. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								/* Intersect ray against primitive. */
+								triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr);
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Only primitives from volume object. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								/* Intersect ray against primitive. */
+								motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr);
+							}
+							break;
+						}
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+						case PRIMITIVE_CURVE:
+						case PRIMITIVE_MOTION_CURVE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Only primitives from volume object. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								/* Intersect ray against primitive. */
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
+									bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								else
+									bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+							}
+							break;
+						}
+#endif
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* Instance push. */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+					int object_flag = kernel_tex_fetch(__object_flag, object);
+
+					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
+#if BVH_FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
+#else
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+#endif
+
+						if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+						if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+						if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+						tfar = ssef(isect->t);
+						idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+						P_idir = P*idir;
+						P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+						org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+						triangle_intersect_precalc(dir, &isect_precalc);
+
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* Pop. */
+						object = OBJECT_NONE;
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+					}
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			/* Instance pop. */
+#if BVH_FEATURE(BVH_MOTION)
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
+#else
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+#endif
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			tfar = ssef(isect->t);
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+			org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr].addr;
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return (isect->prim != PRIM_NONE);
+}
diff --git a/intern/cycles/kernel/geom/geom_qbvh_volume_all.h b/intern/cycles/kernel/geom/geom_qbvh_volume_all.h
new file mode 100644
index 00000000000..056ca9a1ad9
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_qbvh_volume_all.h
@@ -0,0 +1,473 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ *
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ *
+ * Primitive hits on objects flagged SD_OBJECT_HAS_VOLUME are written to
+ * consecutive entries of isect_array until max_hits of them are recorded;
+ * no entry at or beyond index max_hits is touched. Hits found inside an
+ * instance get their t multiplied by that instance's t_fac before the
+ * traversal leaves the instance or returns.
+ *
+ * Returns the number of recorded hits.
+ */
+
+ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
+                                             const Ray *ray,
+                                             Intersection *isect_array,
+                                             const uint max_hits)
+{
+	/* TODO(sergey):
+	 * - Test if pushing distance on the stack helps.
+	 * - Likely and unlikely for if() statements.
+	 * - Test restrict attribute for pointers.
+	 */
+
+	/* Traversal stack, kept in a local array. */
+	QBVHStackItem traversalStack[BVH_QSTACK_SIZE];
+	traversalStack[0].addr = ENTRYPOINT_SENTINEL;
+
+	/* Traversal variables in registers. */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* Ray parameters in registers. */
+	const float tmax = ray->t;
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE;
+	float isect_t = tmax;
+
+	const uint visibility = PATH_RAY_ALL_VISIBILITY;
+
+#if BVH_FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+	uint num_hits = 0;
+
+	/* Without room for a single hit not even the first entry may be written. */
+	if(max_hits == 0) {
+		return 0;
+	}
+	isect_array->t = tmax;
+
+#ifndef __KERNEL_SSE41__
+	if(!isfinite(P.x)) {
+		/* The return value is a hit count, not a boolean. */
+		return 0;
+	}
+#endif
+
+#if BVH_FEATURE(BVH_INSTANCING)
+	int num_hits_in_instance = 0;
+#endif
+
+	ssef tnear(0.0f), tfar(isect_t);
+	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+
+#ifdef __KERNEL_AVX2__
+	float3 P_idir = P*idir;
+	sse3f P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+	sse3f org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+
+	/* Offsets to select the side that becomes the lower or upper bound. */
+	int near_x, near_y, near_z;
+	int far_x, far_y, far_z;
+
+	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+
+	IsectPrecalc isect_precalc;
+	triangle_intersect_precalc(dir, &isect_precalc);
+
+	/* Traversal loop. */
+	do {
+		do {
+			/* Traverse internal nodes. */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				ssef dist;
+				int traverseChild = qbvh_node_intersect(kg,
+				                                        tnear,
+				                                        tfar,
+#ifdef __KERNEL_AVX2__
+				                                        P_idir4,
+#else
+				                                        org,
+#endif
+				                                        idir4,
+				                                        near_x, near_y, near_z,
+				                                        far_x, far_y, far_z,
+				                                        nodeAddr,
+				                                        &dist);
+
+				if(traverseChild != 0) {
+					float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_QNODE_SIZE+6);
+
+					/* One child is hit, continue with that child. */
+					int r = __bscf(traverseChild);
+					if(traverseChild == 0) {
+						nodeAddr = __float_as_int(cnodes[r]);
+						continue;
+					}
+
+					/* Two children are hit, push far child, and continue with
+					 * closer child.
+					 */
+					int c0 = __float_as_int(cnodes[r]);
+					float d0 = ((float*)&dist)[r];
+					r = __bscf(traverseChild);
+					int c1 = __float_as_int(cnodes[r]);
+					float d1 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						if(d1 < d0) {
+							nodeAddr = c1;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c0;
+							traversalStack[stackPtr].dist = d0;
+							continue;
+						}
+						else {
+							nodeAddr = c0;
+							++stackPtr;
+							kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+							traversalStack[stackPtr].addr = c1;
+							traversalStack[stackPtr].dist = d1;
+							continue;
+						}
+					}
+
+					/* Here starts the slow path for 3 or 4 hit children. We push
+					 * all nodes onto the stack to sort them there.
+					 */
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c1;
+					traversalStack[stackPtr].dist = d1;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c0;
+					traversalStack[stackPtr].dist = d0;
+
+					/* Three children are hit, push all onto stack and sort 3
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c2 = __float_as_int(cnodes[r]);
+					float d2 = ((float*)&dist)[r];
+					if(traverseChild == 0) {
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = c2;
+						traversalStack[stackPtr].dist = d2;
+						qbvh_stack_sort(&traversalStack[stackPtr],
+						                &traversalStack[stackPtr - 1],
+						                &traversalStack[stackPtr - 2]);
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+						continue;
+					}
+
+					/* Four children are hit, push all onto stack and sort 4
+					 * stack items, continue with closest child.
+					 */
+					r = __bscf(traverseChild);
+					int c3 = __float_as_int(cnodes[r]);
+					float d3 = ((float*)&dist)[r];
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c3;
+					traversalStack[stackPtr].dist = d3;
+					++stackPtr;
+					kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+					traversalStack[stackPtr].addr = c2;
+					traversalStack[stackPtr].dist = d2;
+					qbvh_stack_sort(&traversalStack[stackPtr],
+					                &traversalStack[stackPtr - 1],
+					                &traversalStack[stackPtr - 2],
+					                &traversalStack[stackPtr - 3]);
+				}
+
+				nodeAddr = traversalStack[stackPtr].addr;
+				--stackPtr;
+			}
+
+			/* If node is leaf, fetch triangle list. */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-nodeAddr-1)*BVH_QNODE_LEAF_SIZE);
+				int primAddr = __float_as_int(leaf.x);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y);
+					const uint type = __float_as_int(leaf.w);
+					const uint p_type = type & PRIMITIVE_ALL;
+					bool hit;
+
+					/* Pop. */
+					nodeAddr = traversalStack[stackPtr].addr;
+					--stackPtr;
+
+					/* Primitive intersection. */
+					switch(p_type) {
+						case PRIMITIVE_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Only primitives from volume object. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								/* Intersect ray against primitive. */
+								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, primAddr);
+								if(hit) {
+									/* Move on to next entry in intersections array. */
+									isect_array++;
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+										/* Only hits recorded inside an instance need their t scaled. */
+										if(object != OBJECT_NONE) {
+#  if BVH_FEATURE(BVH_MOTION)
+											float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir));
+#  else
+											Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+											float t_fac = len(transform_direction(&tfm, 1.0f/idir));
+#  endif
+											for(int i = 0; i < num_hits_in_instance; i++) {
+												(isect_array-i-1)->t *= t_fac;
+											}
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+									/* There is room for another hit, initialize its entry. */
+									isect_array->t = isect_t;
+								}
+							}
+							break;
+						}
+#if BVH_FEATURE(BVH_MOTION)
+						case PRIMITIVE_MOTION_TRIANGLE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Only primitives from volume object. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								/* Intersect ray against primitive. */
+								hit = motion_triangle_intersect(kg, isect_array, P, dir, ray->time, visibility, object, primAddr);
+								if(hit) {
+									/* Move on to next entry in intersections array. */
+									isect_array++;
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+										/* Only hits recorded inside an instance need their t scaled. */
+										if(object != OBJECT_NONE) {
+#  if BVH_FEATURE(BVH_MOTION)
+											float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir));
+#  else
+											Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+											float t_fac = len(transform_direction(&tfm, 1.0f/idir));
+#  endif
+											for(int i = 0; i < num_hits_in_instance; i++) {
+												(isect_array-i-1)->t *= t_fac;
+											}
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+									/* There is room for another hit, initialize its entry. */
+									isect_array->t = isect_t;
+								}
+							}
+							break;
+						}
+#endif
+#if BVH_FEATURE(BVH_HAIR)
+						case PRIMITIVE_CURVE:
+						case PRIMITIVE_MOTION_CURVE: {
+							for(; primAddr < primAddr2; primAddr++) {
+								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
+								/* Only primitives from volume object. */
+								uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+								int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+								if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+									continue;
+								}
+								/* Intersect ray against primitive. */
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
+									hit = bvh_cardinal_curve_intersect(kg, isect_array, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								else
+									hit = bvh_curve_intersect(kg, isect_array, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								if(hit) {
+									/* Move on to next entry in intersections array. */
+									isect_array++;
+									num_hits++;
+#if BVH_FEATURE(BVH_INSTANCING)
+									num_hits_in_instance++;
+#endif
+									if(num_hits == max_hits) {
+#if BVH_FEATURE(BVH_INSTANCING)
+										/* Only hits recorded inside an instance need their t scaled. */
+										if(object != OBJECT_NONE) {
+#  if BVH_FEATURE(BVH_MOTION)
+											float t_fac = len(transform_direction(&ob_tfm, 1.0f/idir));
+#  else
+											Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+											float t_fac = len(transform_direction(&tfm, 1.0f/idir));
+#  endif
+											for(int i = 0; i < num_hits_in_instance; i++) {
+												(isect_array-i-1)->t *= t_fac;
+											}
+										}
+#endif  /* BVH_FEATURE(BVH_INSTANCING) */
+										return num_hits;
+									}
+									/* There is room for another hit, initialize its entry. */
+									isect_array->t = isect_t;
+								}
+							}
+							break;
+						}
+#endif
+					}
+				}
+#if BVH_FEATURE(BVH_INSTANCING)
+				else {
+					/* Instance push. */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+					int object_flag = kernel_tex_fetch(__object_flag, object);
+
+					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
+#if BVH_FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_tfm);
+#else
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+#endif
+
+						if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+						if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+						if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+						tfar = ssef(isect_t);
+						idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+						P_idir = P*idir;
+						P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+						org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+						triangle_intersect_precalc(dir, &isect_precalc);
+						num_hits_in_instance = 0;
+						isect_array->t = isect_t;
+
+						++stackPtr;
+						kernel_assert(stackPtr < BVH_QSTACK_SIZE);
+						traversalStack[stackPtr].addr = ENTRYPOINT_SENTINEL;
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* Pop. */
+						object = OBJECT_NONE;
+						nodeAddr = traversalStack[stackPtr].addr;
+						--stackPtr;
+					}
+				}
+			}
+#endif  /* FEATURE(BVH_INSTANCING) */
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if BVH_FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) {
+			kernel_assert(object != OBJECT_NONE);
+
+			/* Instance pop. */
+			if(num_hits_in_instance) {
+				float t_fac;
+#if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_tfm);
+#else
+				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+#endif
+				/* Scale isect->t to adjust for instancing. */
+				for(int i = 0; i < num_hits_in_instance; i++) {
+					(isect_array-i-1)->t *= t_fac;
+				}
+			}
+			else {
+				float ignore_t = FLT_MAX;
+#if BVH_FEATURE(BVH_MOTION)
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_tfm);
+#else
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+#endif
+			}
+
+			/* Ray is back outside the instance: restore the original limit
+			 * before tfar is rebuilt from it below.
+			 */
+			isect_t = tmax;
+			isect_array->t = isect_t;
+
+			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
+			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
+			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			tfar = ssef(isect_t);
+			idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
+#ifdef __KERNEL_AVX2__
+			P_idir = P*idir;
+			P_idir4 = sse3f(P_idir.x, P_idir.y, P_idir.z);
+#else
+			org = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
+#endif
+			triangle_intersect_precalc(dir, &isect_precalc);
+
+			object = OBJECT_NONE;
+			nodeAddr = traversalStack[stackPtr].addr;
+			--stackPtr;
+		}
+#endif  /* FEATURE(BVH_INSTANCING) */
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return num_hits;
+}
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index 3d3a5e72485..995dfac5b09 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -23,111 +23,18 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Refine triangle intersection to more precise hit point. For rays that travel
- * far the precision is often not so good, this reintersects the primitive from
- * a closer distance. */
-
-ccl_device_inline float3 triangle_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
-{
-	float3 P = ray->P;
-	float3 D = ray->D;
-	float t = isect->t;
-
-#ifdef __INTERSECTION_REFINE__
-	if(isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D*t);
-		D = normalize_len(D, &t);
-	}
-
-	P = P + D*t;
-
-	float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0);
-	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
-	float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z);
-	float rt = Oz * invDz;
-
-	P = P + D*rt;
-
-	if(isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-#else
-	return P + D*t;
-#endif
-}
-
-/* same as above, except that isect->t is assumed to be in object space for instancing */
-ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
-{
-	float3 P = ray->P;
-	float3 D = ray->D;
-	float t = isect->t;
-
-#ifdef __INTERSECTION_REFINE__
-	if(isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D);
-		D = normalize(D);
-	}
-
-	P = P + D*t;
-
-	float4 v00 = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0);
-	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
-	float invDz = 1.0f/(D.x*v00.x + D.y*v00.y + D.z*v00.z);
-	float rt = Oz * invDz;
-
-	P = P + D*rt;
-
-	if(isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-#else
-	return P + D*t;
-#endif
-}
-
 /* normal on triangle  */
 ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
 {
 	/* load triangle vertices */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
 
 	float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
 	float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
 	float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
 	
 	/* return normal */
-	if(sd->flag & SD_NEGATIVE_SCALE_APPLIED)
+	if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED)
 		return normalize(cross(v2 - v0, v1 - v0));
 	else
 		return normalize(cross(v1 - v0, v2 - v0));
@@ -137,7 +44,7 @@ ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
 ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int object, int prim, float u, float v, float3 *P, float3 *Ng, int *shader)
 {
 	/* load triangle vertices */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
 
 	float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
 	float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
@@ -164,7 +71,7 @@ ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int object, int
 
 ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3])
 {
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
 
 	P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
 	P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
@@ -176,7 +83,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
 ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v)
 {
 	/* load triangle vertices */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
 
 	float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.x)));
 	float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.y)));
@@ -187,10 +94,10 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo
 
 /* Ray differentials on triangle */
 
-ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, float3 *dPdu, float3 *dPdv)
+ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, ccl_addr_space float3 *dPdu, ccl_addr_space float3 *dPdv)
 {
 	/* fetch triangle vertex coordinates */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
 
 	float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
 	float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
@@ -209,34 +116,34 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s
 		if(dx) *dx = 0.0f;
 		if(dy) *dy = 0.0f;
 
-		return kernel_tex_fetch(__attributes_float, offset + sd->prim);
+		return kernel_tex_fetch(__attributes_float, offset + ccl_fetch(sd, prim));
 	}
 	else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
-		float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+		float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
 
 		float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x));
 		float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y));
 		float f2 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.z));
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif
 
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
 	}
 	else if(elem == ATTR_ELEMENT_CORNER) {
-		int tri = offset + sd->prim*3;
+		int tri = offset + ccl_fetch(sd, prim)*3;
 		float f0 = kernel_tex_fetch(__attributes_float, tri + 0);
 		float f1 = kernel_tex_fetch(__attributes_float, tri + 1);
 		float f2 = kernel_tex_fetch(__attributes_float, tri + 2);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif
 
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
 	}
 	else {
 		if(dx) *dx = 0.0f;
@@ -252,24 +159,24 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + ccl_fetch(sd, prim)));
 	}
 	else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
-		float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+		float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
 
 		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x)));
 		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y)));
 		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z)));
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif
 
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
 	}
 	else if(elem == ATTR_ELEMENT_CORNER || elem == ATTR_ELEMENT_CORNER_BYTE) {
-		int tri = offset + sd->prim*3;
+		int tri = offset + ccl_fetch(sd, prim)*3;
 		float3 f0, f1, f2;
 
 		if(elem == ATTR_ELEMENT_CORNER) {
@@ -284,11 +191,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 		}
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif
 
-		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
 	}
 	else {
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
@@ -298,116 +205,4 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 	}
 }
 
-/* Ray-Triangle intersection for BVH traversal
- *
- * Based on Sven Woop's algorithm with precomputed triangle storage */
-
-ccl_device_inline bool triangle_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 dir, uint visibility, int object, int triAddr)
-{
-	/* compute and check intersection t-value */
-	float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0);
-	float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1);
-
-	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
-	float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z);
-	float t = Oz * invDz;
-
-	if(t > 0.0f && t < isect->t) {
-		/* compute and check barycentric u */
-		float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z;
-		float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z;
-		float u = Ox + t*Dx;
-
-		if(u >= 0.0f) {
-			/* compute and check barycentric v */
-			float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
-			float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z;
-			float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z;
-			float v = Oy + t*Dy;
-
-			if(v >= 0.0f && u + v <= 1.0f) {
-#ifdef __VISIBILITY_FLAG__
-				/* visibility flag test. we do it here under the assumption
-				 * that most triangles are culled by node flags */
-				if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
-#endif
-				{
-					/* record intersection */
-					isect->prim = triAddr;
-					isect->object = object;
-					isect->type = PRIMITIVE_TRIANGLE;
-					isect->u = u;
-					isect->v = v;
-					isect->t = t;
-					return true;
-				}
-			}
-		}
-	}
-
-	return false;
-}
-
-/* Special ray intersection routines for subsurface scattering. In that case we
- * only want to intersect with primitives in the same object, and if case of
- * multiple hits we pick a single random primitive as the intersection point. */
-
-#ifdef __SUBSURFACE__
-ccl_device_inline void triangle_intersect_subsurface(KernelGlobals *kg, Intersection *isect_array,
-	float3 P, float3 dir, int object, int triAddr, float tmax, uint *num_hits, uint *lcg_state, int max_hits)
-{
-	/* compute and check intersection t-value */
-	float4 v00 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0);
-	float4 v11 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1);
-
-	float Oz = v00.w - P.x*v00.x - P.y*v00.y - P.z*v00.z;
-	float invDz = 1.0f/(dir.x*v00.x + dir.y*v00.y + dir.z*v00.z);
-	float t = Oz * invDz;
-
-	if(t > 0.0f && t < tmax) {
-		/* compute and check barycentric u */
-		float Ox = v11.w + P.x*v11.x + P.y*v11.y + P.z*v11.z;
-		float Dx = dir.x*v11.x + dir.y*v11.y + dir.z*v11.z;
-		float u = Ox + t*Dx;
-
-		if(u >= 0.0f) {
-			/* compute and check barycentric v */
-			float4 v22 = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
-			float Oy = v22.w + P.x*v22.x + P.y*v22.y + P.z*v22.z;
-			float Dy = dir.x*v22.x + dir.y*v22.y + dir.z*v22.z;
-			float v = Oy + t*Dy;
-
-			if(v >= 0.0f && u + v <= 1.0f) {
-				(*num_hits)++;
-
-				int hit;
-
-				if(*num_hits <= max_hits) {
-					hit = *num_hits - 1;
-				}
-				else {
-					/* reservoir sampling: if we are at the maximum number of
-					 * hits, randomly replace element or skip it */
-					hit = lcg_step_uint(lcg_state) % *num_hits;
-
-					if(hit >= max_hits)
-						return;
-				}
-
-				/* record intersection */
-				Intersection *isect = &isect_array[hit];
-				isect->prim = triAddr;
-				isect->object = object;
-				isect->type = PRIMITIVE_TRIANGLE;
-				isect->u = u;
-				isect->v = v;
-				isect->t = t;
-			}
-		}
-	}
-}
-#endif
-
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
new file mode 100644
index 00000000000..ba309a1dc53
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -0,0 +1,431 @@
+/*
+ * Copyright 2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Triangle/Ray intersections.
+ *
+ * For BVH ray intersection we use a precomputed triangle storage to accelerate
+ * intersection at the cost of more memory usage.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Component access by index. CUDA/OpenCL do not allow indexing a float3 value,
+ * so x/y/z is selected explicitly there (idx may be evaluated twice; any idx other than 0 or 1 yields z).
+ */
+#ifndef __KERNEL_CPU__
+#  define IDX(vec, idx) \
+    (((idx) == 0) ? ((vec).x) : ( ((idx) == 1) ? ((vec).y) : ((vec).z) ))
+#else
+#  define IDX(vec, idx) ((vec)[idx])
+#endif
+
+/* Ray-Triangle intersection for BVH traversal
+ *
+ * Sven Woop
+ * Watertight Ray/Triangle Intersection
+ *
+ * http://jcgt.org/published/0002/01/05/paper.pdf
+ */
+
+/* Precalculated per-ray data for the ray->tri intersection; filled by triangle_intersect_precalc(). */
+typedef struct IsectPrecalc {
+	/* Axis indices: kz is the maximal dimension of the ray direction, kx and ky the orthogonal dimensions. */
+	int kx, ky, kz;
+
+	/* Shear constants: Sx = dir[kx] / dir[kz], Sy = dir[ky] / dir[kz], Sz = 1 / dir[kz]. */
+	float Sx, Sy, Sz;
+} IsectPrecalc;
+
+#if defined(__KERNEL_CUDA__)
+#  if (defined(i386) || defined(_M_IX86))
+#    if __CUDA_ARCH__ > 500
+ccl_device_noinline
+#    else  /* __CUDA_ARCH__ > 500 */
+ccl_device_inline
+#    endif  /* __CUDA_ARCH__ > 500 */
+#  else  /* (defined(i386) || defined(_M_IX86)) */
+#    if defined(__KERNEL_EXPERIMENTAL__) && (__CUDA_ARCH__ >= 500)
+ccl_device_noinline
+#    else
+ccl_device_inline
+#    endif
+#  endif  /* (defined(i386) || defined(_M_IX86)) */
+#elif defined(__KERNEL_OPENCL_APPLE__)
+ccl_device_noinline
+#else  /* defined(__KERNEL_OPENCL_APPLE__) */
+ccl_device_inline
+#endif  /* defined(__KERNEL_OPENCL_APPLE__) */
+void triangle_intersect_precalc(float3 dir,
+                                IsectPrecalc *isect_precalc)
+{
+	/* Fill isect_precalc for ray direction dir. The dimension where the
+	 * direction is maximal becomes kz; kx and ky follow it cyclically. */
+	const float3 abs_dir = make_float3(fabsf(dir.x), fabsf(dir.y), fabsf(dir.z));
+	const int kz = util_max_axis(abs_dir);
+	int kx = (kz == 2) ? 0 : kz + 1;
+	int ky = (kx == 2) ? 0 : kx + 1;
+
+	/* Swap kx and ky dimensions to preserve winding direction of triangles. */
+	if(IDX(dir, kz) < 0.0f) {
+		const int old_kx = kx;
+		kx = ky;
+		ky = old_kx;
+	}
+
+	/* The shear constants all share the reciprocal of the dominant component. */
+	const float rcp_dir_kz = 1.0f / IDX(dir, kz);
+	isect_precalc->Sz = rcp_dir_kz;
+	isect_precalc->Sx = IDX(dir, kx) * rcp_dir_kz;
+	isect_precalc->Sy = IDX(dir, ky) * rcp_dir_kz;
+
+	/* Remember which axis plays which role. */
+	isect_precalc->kz = kz;
+	isect_precalc->kx = kx;
+	isect_precalc->ky = ky;
+}
+
+/* XOR the bit pattern of x with y; callers in this file pass a sign-bit mask. TODO(sergey): Make it general utility function. */
+ccl_device_inline float xor_signmask(float x, int y)
+{
+	return __int_as_float(__float_as_int(x) ^ y);
+}
+
+/* Intersect the ray with origin P, whose direction is encoded in isect_precalc, against triangle triAddr.
+ * Returns true and fills isect only when a hit is found within the distance currently stored in isect->t. */
+ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
+                                          const IsectPrecalc *isect_precalc,
+                                          Intersection *isect,
+                                          float3 P,
+                                          uint visibility,
+                                          int object,
+                                          int triAddr)
+{
+	const int kx = isect_precalc->kx;
+	const int ky = isect_precalc->ky;
+	const int kz = isect_precalc->kz;
+	const float Sx = isect_precalc->Sx;
+	const float Sy = isect_precalc->Sy;
+	const float Sz = isect_precalc->Sz;
+
+	/* Calculate vertices relative to ray origin. */
+	const float4 tri_a = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0),
+	             tri_b = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1),
+	             tri_c = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
+	const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z);
+	const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z);
+	const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z);
+
+	const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz);
+	const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz);
+	const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz);
+
+	/* Perform shear and scale of vertices. */
+	const float Ax = A_kx - Sx * A_kz;
+	const float Ay = A_ky - Sy * A_kz;
+	const float Bx = B_kx - Sx * B_kz;
+	const float By = B_ky - Sy * B_kz;
+	const float Cx = C_kx - Sx * C_kz;
+	const float Cy = C_ky - Sy * C_kz;
+
+	/* Calculate scaled barycentric coordinates. */
+	float U = Cx * By - Cy * Bx;
+	float V = Ax * Cy - Ay * Cx;
+	float W = Bx * Ay - By * Ax;
+	const int sign_mask = (__float_as_int(U) & 0x80000000);
+	/* U, V and W must share the same sign bit for a hit. TODO(sergey): Check if multiplication
+	 * plus sign check is faster or at least same speed (but robust for endian types). */
+	if(sign_mask != (__float_as_int(V) & 0x80000000) ||
+	   sign_mask != (__float_as_int(W) & 0x80000000))
+	{
+		return false;
+	}
+
+	/* Calculate determinant; a zero determinant is rejected since it is inverted below. */
+	float det = U + V + W;
+	if(UNLIKELY(det == 0.0f)) {
+		return false;
+	}
+
+	/* Calculate scaled z-coordinates of vertices and use them to calculate the hit distance.
+	 * The range test is done on sign-corrected values still scaled by det, before any division. */
+	const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz;
+	const float sign_T = xor_signmask(T, sign_mask);
+	if((sign_T < 0.0f) ||
+	   (sign_T > isect->t * xor_signmask(det, sign_mask)))
+	{
+		return false;
+	}
+
+#ifdef __VISIBILITY_FLAG__
+	/* Visibility flag test. We do it here under the assumption that most triangles are culled by node flags. */
+	if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
+#endif
+	{
+#ifdef __KERNEL_GPU__
+		/* Reject triangles whose two edge vectors have a zero cross product. */
+		float4 a = tri_b - tri_a, b = tri_c - tri_a;
+		if(len_squared(make_float3(a.y*b.z - a.z*b.y,
+		                           a.z*b.x - a.x*b.z,
+		                           a.x*b.y - a.y*b.x)) == 0.0f)
+		{
+			return false;
+		}
+#endif
+
+		/* Normalize U, V, W, and T. */
+		const float inv_det = 1.0f / det;
+		isect->prim = triAddr;
+		isect->object = object;
+		isect->type = PRIMITIVE_TRIANGLE;
+		isect->u = U * inv_det;
+		isect->v = V * inv_det;
+		isect->t = T * inv_det;
+		return true;
+	}
+	return false;
+}
+
+/* Special ray intersection routines for subsurface scattering. In that case we
+ * only want to intersect with primitives in the same object, and in case of
+ * multiple hits we pick a single random primitive as the intersection point.
+ */
+
+#ifdef __SUBSURFACE__
+ccl_device_inline void triangle_intersect_subsurface(
+        KernelGlobals *kg,
+        const IsectPrecalc *isect_precalc,
+        Intersection *isect_array,  /* receives the recorded hits, indexed below max_hits */
+        float3 P,
+        int object,
+        int triAddr,
+        float tmax,  /* hits farther than tmax are rejected */
+        uint *num_hits,  /* incremented for every accepted hit, may exceed max_hits */
+        uint *lcg_state,  /* random state advanced by lcg_step_uint() */
+        int max_hits)
+{
+	const int kx = isect_precalc->kx;
+	const int ky = isect_precalc->ky;
+	const int kz = isect_precalc->kz;
+	const float Sx = isect_precalc->Sx;
+	const float Sy = isect_precalc->Sy;
+	const float Sz = isect_precalc->Sz;
+
+	/* Calculate vertices relative to ray origin. */
+	const float4 tri_a = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+0),
+	             tri_b = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+1),
+	             tri_c = kernel_tex_fetch(__tri_woop, triAddr*TRI_NODE_SIZE+2);
+	const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z);
+	const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z);
+	const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z);
+
+	const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz);
+	const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz);
+	const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz);
+
+	/* Perform shear and scale of vertices. */
+	const float Ax = A_kx - Sx * A_kz;
+	const float Ay = A_ky - Sy * A_kz;
+	const float Bx = B_kx - Sx * B_kz;
+	const float By = B_ky - Sy * B_kz;
+	const float Cx = C_kx - Sx * C_kz;
+	const float Cy = C_ky - Sy * C_kz;
+
+	/* Calculate scaled barycentric coordinates; bail out as soon as a sign bit differs from U's. */
+	float U = Cx * By - Cy * Bx;
+	int sign_mask = (__float_as_int(U) & 0x80000000);
+	float V = Ax * Cy - Ay * Cx;
+	if(sign_mask != (__float_as_int(V) & 0x80000000)) {
+		return;
+	}
+	float W = Bx * Ay - By * Ax;
+	if(sign_mask != (__float_as_int(W) & 0x80000000)) {
+		return;
+	}
+
+	/* Calculate determinant; a zero determinant is rejected since it is inverted below. */
+	float det = U + V + W;
+	if(UNLIKELY(det == 0.0f)) {
+		return;
+	}
+
+	/* Calculate scaled z-coordinates of vertices and use them to calculate
+	 * the hit distance.
+	 */
+	const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz;
+	const float sign_T = xor_signmask(T, sign_mask);
+	if((sign_T < 0.0f) ||
+	   (sign_T > tmax * xor_signmask(det, sign_mask)))
+	{
+		return;
+	}
+
+	/* Normalize U, V, W, and T. */
+	const float inv_det = 1.0f / det;
+
+	(*num_hits)++;
+	int hit;
+
+	if(*num_hits <= max_hits) {
+		hit = *num_hits - 1;
+	}
+	else {
+		/* Reservoir sampling: if we are at the maximum number of
+		 * hits, randomly replace element or skip it. */
+		hit = lcg_step_uint(lcg_state) % *num_hits;
+
+		if(hit >= max_hits)
+			return;
+	}
+
+	/* Record intersection. */
+	Intersection *isect = &isect_array[hit];
+	isect->prim = triAddr;
+	isect->object = object;
+	isect->type = PRIMITIVE_TRIANGLE;
+	isect->u = U * inv_det;
+	isect->v = V * inv_det;
+	isect->t = T * inv_det;
+}
+#endif
+
+/* Refine triangle intersection to more precise hit point. For rays that travel
+ * far the precision is often not so good, this reintersects the primitive from
+ * a closer distance, using the paper:
+ *
+ * Tomas Moeller
+ * Fast, minimum storage ray/triangle intersection
+ * http://www.cs.virginia.edu/~gfx/Courses/2003/ImageSynthesis/papers/Acceleration/Fast%20MinimumStorage%20RayTriangle%20Intersection.pdf
+ *
+ * Returns the refined position, or ray->P + ray->D*isect->t when __INTERSECTION_REFINE__ is not defined.
+ */
+ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
+                                         ShaderData *sd,
+                                         const Intersection *isect,
+                                         const Ray *ray)
+{
+	float3 P = ray->P;
+	float3 D = ray->D;
+	float t = isect->t;
+
+#ifdef __INTERSECTION_REFINE__
+	if(isect->object != OBJECT_NONE) {
+		if(UNLIKELY(t == 0.0f)) {
+			return P;
+		}
+#ifdef __OBJECT_MOTION__
+		Transform tfm = ccl_fetch(sd, ob_itfm);
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D*t);
+		D = normalize_len(D, &t);  /* NOTE(review): presumably stores the transformed length in t -- confirm */
+	}
+
+	P = P + D*t;
+
+	const float4 tri_a = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0),
+	             tri_b = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+1),
+	             tri_c = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+2);
+	float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z);
+	float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z);
+	float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
+	float3 qvec = cross(tvec, edge1);
+	float3 pvec = cross(D, edge2);
+	float det = dot(edge1, pvec);
+	/* A zero determinant means the ray lies in the triangle plane: keep the unrefined point instead of dividing by zero. */
+	if(det != 0.0f)
+		P = P + D*(dot(edge2, qvec) / det);
+
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = ccl_fetch(sd, ob_tfm);
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+#else
+	return P + D*t;
+#endif
+}
+
+/* Same as triangle_refine(), except that isect->t is assumed to be in object
+ * space for instancing: the direction is transformed and normalized without
+ * rescaling t.
+ *
+ * Returns the refined position, or ray->P + ray->D*isect->t when __INTERSECTION_REFINE__ is not defined.
+ */
+ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
+                                                    ShaderData *sd,
+                                                    const Intersection *isect,
+                                                    const Ray *ray)
+{
+	float3 P = ray->P;
+	float3 D = ray->D;
+	float t = isect->t;
+
+#ifdef __INTERSECTION_REFINE__
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = ccl_fetch(sd, ob_itfm);
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D);
+		D = normalize(D);
+	}
+
+	P = P + D*t;
+
+	const float4 tri_a = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+0),
+	             tri_b = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+1),
+	             tri_c = kernel_tex_fetch(__tri_woop, isect->prim*TRI_NODE_SIZE+2);
+	float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z);
+	float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z);
+	float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
+	float3 qvec = cross(tvec, edge1);
+	float3 pvec = cross(D, edge2);
+	float det = dot(edge1, pvec);
+	/* A zero determinant means the ray lies in the triangle plane: keep the unrefined point instead of dividing by zero. */
+	if(det != 0.0f)
+		P = P + D*(dot(edge2, qvec) / det);
+
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = ccl_fetch(sd, ob_tfm);
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+#else
+	return P + D*t;
+#endif
+}
+
+#undef IDX
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 33a20494966..c72afa2a3a4 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* Volume Primitive
@@ -52,11 +52,15 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 #ifdef __KERNEL_GPU__
 	float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 #else
-	float4 r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
+	float4 r;
+	if(sd->flag & SD_VOLUME_CUBIC)
+		r = kernel_tex_image_interp_3d_ex(id, P.x, P.y, P.z, INTERPOLATION_CUBIC);
+	else
+		r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
 #endif
 
 	if(dx) *dx = 0.0f;
-	if(dx) *dy = 0.0f;
+	if(dy) *dy = 0.0f;
 
 	/* todo: support float textures to lower memory usage for single floats */
 	return average(float4_to_float3(r));
@@ -68,7 +72,11 @@ ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *s
 #ifdef __KERNEL_GPU__
 	float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 #else
-	float4 r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
+	float4 r;
+	if(sd->flag & SD_VOLUME_CUBIC)
+		r = kernel_tex_image_interp_3d_ex(id, P.x, P.y, P.z, INTERPOLATION_CUBIC);
+	else
+		r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
 #endif
 
 	if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 19e06b88797..b2596d10ee7 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __KERNEL_H__
@@ -32,7 +32,14 @@ void *kernel_osl_memory(KernelGlobals *kg);
 bool kernel_osl_use(KernelGlobals *kg);
 
 void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size);
-void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t width, size_t height, size_t depth, InterpolationType interpolation=INTERPOLATION_LINEAR);
+void kernel_tex_copy(KernelGlobals *kg,
+                     const char *name,
+                     device_ptr mem,
+                     size_t width,
+                     size_t height,
+                     size_t depth,
+                     InterpolationType interpolation=INTERPOLATION_LINEAR,
+                     ExtensionType extension = EXTENSION_REPEAT);
 
 void kernel_cpu_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
 	int sample, int x, int y, int offset, int stride);
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index b0efcdc66a7..2dc87fffcbc 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -176,7 +176,7 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
 #endif
 }
 
-ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, float3 *throughput,
+ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
 	BsdfEval *bsdf_eval, float bsdf_pdf, int bounce, int bsdf_label)
 {
 	float inverse_pdf = 1.0f/bsdf_pdf;
@@ -341,12 +341,12 @@ ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L)
 
 ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadiance *L)
 {
-	float3 L_sum, L_direct, L_indirect;
-	float clamp_direct = kernel_data.integrator.sample_clamp_direct;
-	float clamp_indirect = kernel_data.integrator.sample_clamp_indirect;
-
+	float3 L_sum;
 	/* Light Passes are used */
 #ifdef __PASSES__
+	float3 L_direct, L_indirect;
+	float clamp_direct = kernel_data.integrator.sample_clamp_direct;
+	float clamp_indirect = kernel_data.integrator.sample_clamp_indirect;
 	if(L->use_light_pass) {
 		path_radiance_sum_indirect(L);
 
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index a1ec080e3d3..2b305e5488d 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -11,11 +11,13 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
 
+#undef USE_BAKE_JITTER
+
 ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, RNG rng,
                                    const bool is_combined, const bool is_ao, const bool is_sss, int sample)
 {
@@ -29,6 +31,13 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
 	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
 	bool is_sss_sample = is_sss;
 
+	ray.P = sd->P + sd->Ng;
+	ray.D = -sd->Ng;
+	ray.t = FLT_MAX;
+#ifdef __CAMERA_MOTION__
+	ray.time = TIME_INVALID;
+#endif
+
 	/* init radiance */
 	path_radiance_init(&L_sample, kernel_data.film.use_light_pass);
 
@@ -55,7 +64,7 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
 		/* sample subsurface scattering */
 		if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
 			/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
-			if (kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput))
+			if(kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput))
 				is_sss_sample = true;
 		}
 #endif
@@ -159,7 +168,8 @@ ccl_device bool is_light_pass(ShaderEvalType type)
 	}
 }
 
-#if 0
+/* This helps with AA, but it is not the real solution as it does not anti-alias
+ * the geometry; it is better than nothing, thus committed. */
 ccl_device_inline float bake_clamp_mirror_repeat(float u)
 {
 	/* use mirror repeat (like opengl texture) so that if the barycentric
@@ -170,7 +180,6 @@ ccl_device_inline float bake_clamp_mirror_repeat(float u)
 
 	return (((int)fu) & 1)? 1.0f - u: u;
 }
-#endif
 
 ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output,
                                      ShaderEvalType type, int i, int offset, int sample)
@@ -198,12 +207,16 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	int num_samples = kernel_data.integrator.aa_samples;
 
 	/* random number generator */
-	RNG rng = cmj_hash(offset + i, 0);
+	RNG rng = cmj_hash(offset + i, kernel_data.integrator.seed);
 
-#if 0
-	uint rng_state = cmj_hash(i, 0);
+#ifdef USE_BAKE_JITTER
 	float filter_x, filter_y;
-	path_rng_init(kg, &rng_state, sample, num_samples, &rng, 0, 0, &filter_x, &filter_y);
+	if(sample == 0) {
+		filter_x = filter_y = 0.5f;
+	}
+	else {
+		path_rng_2D(kg, &rng, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y);
+	}
 
 	/* subpixel u/v offset */
 	if(sample > 0) {
@@ -253,6 +266,10 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		/* data passes */
 		case SHADER_EVAL_NORMAL:
 		{
+			if((sd.flag & SD_HAS_BUMP)) {
+				shader_eval_surface(kg, &sd, 0.f, 0, SHADER_CONTEXT_MAIN);
+			}
+
 			/* compression: normal = (2 * color) - 1 */
 			out = sd.N * 0.5f + make_float3(0.5f, 0.5f, 0.5f);
 			break;
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index d1217ae0abc..2d531fdc96e 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -41,11 +41,34 @@ ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v)
 	return bokeh;
 }
 
-ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
 {
 	/* create ray form raster position */
 	Transform rastertocamera = kernel_data.cam.rastertocamera;
-	float3 Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f));
+	float3 raster = make_float3(raster_x, raster_y, 0.0f);
+	float3 Pcamera = transform_perspective(&rastertocamera, raster);
+
+#ifdef __CAMERA_MOTION__
+	if(kernel_data.cam.have_perspective_motion) {
+		/* TODO(sergey): Currently we interpolate the projected coordinate, which
+		 * gives a nice looking result and is simple, but is in fact a bit
+		 * different compared to constructing a projective matrix from an
+		 * interpolated field of view.
+		 */
+		if(ray->time < 0.5f) {
+			Transform rastertocamera_pre = kernel_data.cam.perspective_motion.pre;
+			float3 Pcamera_pre =
+			        transform_perspective(&rastertocamera_pre, raster);
+			Pcamera = interp(Pcamera_pre, Pcamera, ray->time * 2.0f);
+		}
+		else {
+			Transform rastertocamera_post = kernel_data.cam.perspective_motion.post;
+			float3 Pcamera_post =
+			        transform_perspective(&rastertocamera_post, raster);
+			Pcamera = interp(Pcamera, Pcamera_post, (ray->time - 0.5f) * 2.0f);
+		}
+	}
+#endif
 
 	ray->P = make_float3(0.0f, 0.0f, 0.0f);
 	ray->D = Pcamera;
@@ -70,8 +93,18 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo
 	Transform cameratoworld = kernel_data.cam.cameratoworld;
 
 #ifdef __CAMERA_MOTION__
-	if(kernel_data.cam.have_motion)
-		transform_motion_interpolate(&cameratoworld, (const DecompMotionTransform*)&kernel_data.cam.motion, ray->time);
+	if(kernel_data.cam.have_motion) {
+#ifdef __KERNEL_OPENCL__
+		const MotionTransform tfm = kernel_data.cam.motion;
+		transform_motion_interpolate(&cameratoworld,
+		                             ((const DecompMotionTransform*)&tfm),
+		                             ray->time);
+#else
+		transform_motion_interpolate(&cameratoworld,
+		                             ((const DecompMotionTransform*)&kernel_data.cam.motion),
+		                             ray->time);
+#endif
+	}
 #endif
 
 	ray->P = transform_point(&cameratoworld, ray->P);
@@ -90,16 +123,17 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo
 
 #ifdef __CAMERA_CLIPPING__
 	/* clipping */
-	ray->P += kernel_data.cam.nearclip*ray->D;
-	ray->t = kernel_data.cam.cliplength;
+	float3 Pclip = normalize(Pcamera);
+	float z_inv = 1.0f / Pclip.z;
+	ray->P += kernel_data.cam.nearclip*ray->D * z_inv;
+	ray->t = kernel_data.cam.cliplength * z_inv;
 #else
 	ray->t = FLT_MAX;
 #endif
 }
 
 /* Orthographic Camera */
-
-ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
 {
 	/* create ray form raster position */
 	Transform rastertocamera = kernel_data.cam.rastertocamera;
@@ -129,8 +163,18 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl
 	Transform cameratoworld = kernel_data.cam.cameratoworld;
 
 #ifdef __CAMERA_MOTION__
-	if(kernel_data.cam.have_motion)
-		transform_motion_interpolate(&cameratoworld, (const DecompMotionTransform*)&kernel_data.cam.motion, ray->time);
+	if(kernel_data.cam.have_motion) {
+#ifdef __KERNEL_OPENCL__
+		const MotionTransform tfm = kernel_data.cam.motion;
+		transform_motion_interpolate(&cameratoworld,
+		                             (const DecompMotionTransform*)&tfm,
+		                             ray->time);
+#else
+		transform_motion_interpolate(&cameratoworld,
+		                             (const DecompMotionTransform*)&kernel_data.cam.motion,
+		                             ray->time);
+#endif
+	}
 #endif
 
 	ray->P = transform_point(&cameratoworld, ray->P);
@@ -155,7 +199,7 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl
 
 /* Panorama Camera */
 
-ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
 {
 	Transform rastertocamera = kernel_data.cam.rastertocamera;
 	float3 Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f));
@@ -203,8 +247,18 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float
 	Transform cameratoworld = kernel_data.cam.cameratoworld;
 
 #ifdef __CAMERA_MOTION__
-	if(kernel_data.cam.have_motion)
-		transform_motion_interpolate(&cameratoworld, (const DecompMotionTransform*)&kernel_data.cam.motion, ray->time);
+	if(kernel_data.cam.have_motion) {
+#ifdef __KERNEL_OPENCL__
+		const MotionTransform tfm = kernel_data.cam.motion;
+		transform_motion_interpolate(&cameratoworld,
+		                             (const DecompMotionTransform*)&tfm,
+		                             ray->time);
+#else
+		transform_motion_interpolate(&cameratoworld,
+		                             (const DecompMotionTransform*)&kernel_data.cam.motion,
+		                             ray->time);
+#endif
+	}
 #endif
 
 	ray->P = transform_point(&cameratoworld, ray->P);
@@ -215,18 +269,21 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float
 	/* ray differential */
 	ray->dP = differential3_zero();
 
+	Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f));
+	float3 Ddiff = normalize(transform_direction(&cameratoworld, panorama_to_direction(kg, Pcamera.x, Pcamera.y)));
+
 	Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x + 1.0f, raster_y, 0.0f));
-	ray->dD.dx = normalize(transform_direction(&cameratoworld, panorama_to_direction(kg, Pcamera.x, Pcamera.y))) - ray->D;
+	ray->dD.dx = normalize(transform_direction(&cameratoworld, panorama_to_direction(kg, Pcamera.x, Pcamera.y))) - Ddiff;
 
 	Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y + 1.0f, 0.0f));
-	ray->dD.dy = normalize(transform_direction(&cameratoworld, panorama_to_direction(kg, Pcamera.x, Pcamera.y))) - ray->D;
+	ray->dD.dy = normalize(transform_direction(&cameratoworld, panorama_to_direction(kg, Pcamera.x, Pcamera.y))) - Ddiff;
 #endif
 }
 
 /* Common */
 
 ccl_device void camera_sample(KernelGlobals *kg, int x, int y, float filter_u, float filter_v,
-	float lens_u, float lens_v, float time, Ray *ray)
+	float lens_u, float lens_v, float time, ccl_addr_space Ray *ray)
 {
 	/* pixel filter */
 	int filter_table_offset = kernel_data.film.filter_table_offset;
@@ -303,7 +360,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 {
 	if(kernel_data.cam.type != CAMERA_PANORAMA) {
 		/* perspective / ortho */
-		if(sd->object == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
+		if(ccl_fetch(sd, object) == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
 			P += camera_position(kg);
 
 		Transform tfm = kernel_data.cam.worldtondc;
@@ -313,7 +370,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 		/* panorama */
 		Transform tfm = kernel_data.cam.worldtocamera;
 
-		if(sd->object != OBJECT_NONE)
+		if(ccl_fetch(sd, object) != OBJECT_NONE)
 			P = normalize(transform_point(&tfm, P));
 		else
 			P = normalize(transform_direction(&tfm, P));
@@ -325,4 +382,3 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index c2aab93c87b..ed145b4a967 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __KERNEL_COMPAT_CPU_H__
@@ -19,12 +19,39 @@
 
 #define __KERNEL_CPU__
 
+/* Release builds of the kernel emit too many false-positive
+ * maybe-uninitialized warnings, which makes it easy to miss genuine ones.
+ */
+#if defined(__GNUC__) && defined(NDEBUG)
+#  pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#  pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+
+/* Selective nodes compilation. */
+#ifndef __NODES_MAX_GROUP__
+#  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
+#endif
+#ifndef __NODES_FEATURES__
+#  define __NODES_FEATURES__ NODE_FEATURE_ALL
+#endif
+
 #include "util_debug.h"
 #include "util_math.h"
 #include "util_simd.h"
 #include "util_half.h"
 #include "util_types.h"
 
+#define ccl_addr_space
+
+/* On x86_64, glibc versions older than 2.16 have an expf() that is much
+ * slower than the double version, so route expf() through exp() there.
+ */
+#if !defined(__KERNEL_GPU__)  && defined(__x86_64__) && \
+     defined(__GNU_LIBRARY__) && defined(__GLIBC__ ) && defined(__GLIBC_MINOR__) && \
+     (__GLIBC__ < 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ < 16))
+#  define expf(x) ((float)exp((double)(x)))
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 /* Assertions inside the kernel only work for the CPU device, so we wrap it in
@@ -43,7 +70,7 @@ template<typename T> struct texture  {
 		return data[index];
 	}
 
-#if 0
+#ifdef __KERNEL_SSE2__
 	ccl_always_inline ssef fetch_ssef(int index)
 	{
 		kernel_assert(index >= 0 && index < width);
@@ -62,6 +89,14 @@ template<typename T> struct texture  {
 };
 
 template<typename T> struct texture_image  {
+#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
+	{ /* Fill u[0..3] with the cubic B-spline weights for offset t; they sum to 1. */ \
+		u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); /* (1 - t)^3 / 6 */ \
+		u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); /* (3t^3 - 6t^2 + 4) / 6 */ \
+		u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); /* (-3t^3 + 3t^2 + 3t + 1) / 6 */ \
+		u[3] = (1.0f / 6.0f) * t * t * t; /* t^3 / 6 */ \
+	} (void)0 /* The trailing (void)0 makes the call site end with a semicolon. */
+
 	ccl_always_inline float4 read(float4 r)
 	{
 		return r;
@@ -93,7 +128,7 @@ template<typename T> struct texture_image  {
 		return x - (float)i;
 	}
 
-	ccl_always_inline float4 interp(float x, float y, bool periodic = true)
+	ccl_always_inline float4 interp(float x, float y)
 	{
 		if(UNLIKELY(!data))
 			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
@@ -103,34 +138,47 @@ template<typename T> struct texture_image  {
 		if(interpolation == INTERPOLATION_CLOSEST) {
 			frac(x*(float)width, &ix);
 			frac(y*(float)height, &iy);
-			if(periodic) {
-				ix = wrap_periodic(ix, width);
-				iy = wrap_periodic(iy, height);
-
-			}
-			else {
-				ix = wrap_clamp(ix, width);
-				iy = wrap_clamp(iy, height);
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					break;
+				case EXTENSION_CLIP:
+					if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					break;
 			}
 			return read(data[ix + iy*width]);
 		}
-		else {
+		else if(interpolation == INTERPOLATION_LINEAR) {
 			float tx = frac(x*(float)width - 0.5f, &ix);
 			float ty = frac(y*(float)height - 0.5f, &iy);
 
-			if(periodic) {
-				ix = wrap_periodic(ix, width);
-				iy = wrap_periodic(iy, height);
-
-				nix = wrap_periodic(ix+1, width);
-				niy = wrap_periodic(iy+1, height);
-			}
-			else {
-				ix = wrap_clamp(ix, width);
-				iy = wrap_clamp(iy, height);
-
-				nix = wrap_clamp(ix+1, width);
-				niy = wrap_clamp(iy+1, height);
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+					break;
+				case EXTENSION_CLIP:
+					if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					break;
 			}
 
 			float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]);
@@ -140,9 +188,79 @@ template<typename T> struct texture_image  {
 
 			return r;
 		}
+		else {
+			/* Bicubic b-spline interpolation. */
+			float tx = frac(x*(float)width - 0.5f, &ix);
+			float ty = frac(y*(float)height - 0.5f, &iy);
+			int pix, piy, nnix, nniy;
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+
+					pix = wrap_periodic(ix-1, width);
+					piy = wrap_periodic(iy-1, height);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+
+					nnix = wrap_periodic(ix+2, width);
+					nniy = wrap_periodic(iy+2, height);
+					break;
+				case EXTENSION_CLIP:
+					if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					pix = wrap_clamp(ix-1, width);
+					piy = wrap_clamp(iy-1, height);
+
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+
+					nnix = wrap_clamp(ix+2, width);
+					nniy = wrap_clamp(iy+2, height);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					break;
+			}
+
+			const int xc[4] = {pix, ix, nix, nnix};
+			const int yc[4] = {width * piy,
+			                   width * iy,
+			                   width * niy,
+			                   width * nniy};
+			float u[4], v[4];
+			/* Helper macros that keep the code a reasonable size while
+			 * letting the compiler inline all the matrix multiplications.
+			 */
+#define DATA(x, y) (read(data[xc[x] + yc[y]]))
+#define TERM(col) \
+			(v[col] * (u[0] * DATA(0, col) + \
+			           u[1] * DATA(1, col) + \
+			           u[2] * DATA(2, col) + \
+			           u[3] * DATA(3, col)))
+
+			SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+			SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+
+			/* Actual interpolation. */
+			return TERM(0) + TERM(1) + TERM(2) + TERM(3);
+
+#undef TERM
+#undef DATA
+		}
 	}
 
-	ccl_always_inline float4 interp_3d(float x, float y, float z, bool periodic = false)
+	ccl_always_inline float4 interp_3d(float x, float y, float z)
+	{
+		return interp_3d_ex(x, y, z, interpolation);
+	}
+
+	ccl_always_inline float4 interp_3d_ex(float x, float y, float z,
+	                                      int interpolation = INTERPOLATION_LINEAR)
 	{
 		if(UNLIKELY(!data))
 			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
@@ -154,41 +272,55 @@ template<typename T> struct texture_image  {
 			frac(y*(float)height, &iy);
 			frac(z*(float)depth, &iz);
 
-			if(periodic) {
-				ix = wrap_periodic(ix, width);
-				iy = wrap_periodic(iy, height);
-				iz = wrap_periodic(iz, depth);
-			}
-			else {
-				ix = wrap_clamp(ix, width);
-				iy = wrap_clamp(iy, height);
-				iz = wrap_clamp(iz, depth);
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					iz = wrap_periodic(iz, depth);
+					break;
+				case EXTENSION_CLIP: /* Outside [0, 1] on any axis, z included: return zeros. */
+					if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					iz = wrap_clamp(iz, depth);
+					break;
 			}
 
 			return read(data[ix + iy*width + iz*width*height]);
 		}
-		else {
+		else if(interpolation == INTERPOLATION_LINEAR) {
 			float tx = frac(x*(float)width - 0.5f, &ix);
 			float ty = frac(y*(float)height - 0.5f, &iy);
 			float tz = frac(z*(float)depth - 0.5f, &iz);
 
-			if(periodic) {
-				ix = wrap_periodic(ix, width);
-				iy = wrap_periodic(iy, height);
-				iz = wrap_periodic(iz, depth);
-
-				nix = wrap_periodic(ix+1, width);
-				niy = wrap_periodic(iy+1, height);
-				niz = wrap_periodic(iz+1, depth);
-			}
-			else {
-				ix = wrap_clamp(ix, width);
-				iy = wrap_clamp(iy, height);
-				iz = wrap_clamp(iz, depth);
-
-				nix = wrap_clamp(ix+1, width);
-				niy = wrap_clamp(iy+1, height);
-				niz = wrap_clamp(iz+1, depth);
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					iz = wrap_periodic(iz, depth);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+					niz = wrap_periodic(iz+1, depth);
+					break;
+				case EXTENSION_CLIP: /* Outside [0, 1] on any axis, z included: return zeros. */
+					if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+					niz = wrap_clamp(iz+1, depth);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					iz = wrap_clamp(iz, depth);
+					break;
 			}
 
 			float4 r;
@@ -205,6 +337,92 @@ template<typename T> struct texture_image  {
 
 			return r;
 		}
+		else {
+			/* Tricubic b-spline interpolation. */
+			const float tx = frac(x*(float)width - 0.5f, &ix);
+			const float ty = frac(y*(float)height - 0.5f, &iy);
+			const float tz = frac(z*(float)depth - 0.5f, &iz);
+			int pix, piy, piz, nnix, nniy, nniz;
+
+			switch(extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					iz = wrap_periodic(iz, depth);
+
+					pix = wrap_periodic(ix-1, width);
+					piy = wrap_periodic(iy-1, height);
+					piz = wrap_periodic(iz-1, depth);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+					niz = wrap_periodic(iz+1, depth);
+
+					nnix = wrap_periodic(ix+2, width);
+					nniy = wrap_periodic(iy+2, height);
+					nniz = wrap_periodic(iz+2, depth);
+					break;
+				case EXTENSION_CLIP: /* Outside [0, 1] on any axis, z included: return zeros. */
+					if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					/* Fall through. */
+				case EXTENSION_EXTEND:
+					pix = wrap_clamp(ix-1, width);
+					piy = wrap_clamp(iy-1, height);
+					piz = wrap_clamp(iz-1, depth);
+
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+					niz = wrap_clamp(iz+1, depth);
+
+					nnix = wrap_clamp(ix+2, width);
+					nniy = wrap_clamp(iy+2, height);
+					nniz = wrap_clamp(iz+2, depth);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					iz = wrap_clamp(iz, depth);
+					break;
+			}
+
+			const int xc[4] = {pix, ix, nix, nnix};
+			const int yc[4] = {width * piy,
+			                   width * iy,
+			                   width * niy,
+			                   width * nniy};
+			const int zc[4] = {width * height * piz,
+			                   width * height * iz,
+			                   width * height * niz,
+			                   width * height * nniz};
+			float u[4], v[4], w[4];
+
+			/* Helper macros that keep the code a reasonable size while
+			 * letting the compiler inline all the matrix multiplications.
+			 */
+#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
+#define COL_TERM(col, row) \
+			(v[col] * (u[0] * DATA(0, col, row) + \
+			           u[1] * DATA(1, col, row) + \
+			           u[2] * DATA(2, col, row) + \
+			           u[3] * DATA(3, col, row)))
+#define ROW_TERM(row) \
+			(w[row] * (COL_TERM(0, row) + \
+			           COL_TERM(1, row) + \
+			           COL_TERM(2, row) + \
+			           COL_TERM(3, row)))
+
+			SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+			SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+			SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+			/* Actual interpolation. */
+			return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+#undef COL_TERM
+#undef ROW_TERM
+#undef DATA
+		}
 	}
 
 	ccl_always_inline void dimensions_set(int width_, int height_, int depth_)
@@ -216,7 +434,9 @@ template<typename T> struct texture_image  {
 
 	T *data;
 	int interpolation;
+	ExtensionType extension;
 	int width, height, depth;
+#undef SET_CUBIC_SPLINE_WEIGHTS
 };
 
 typedef texture<float4> texture_float4;
@@ -237,9 +457,38 @@ typedef texture_image<uchar4> texture_image_uchar4;
 #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
 #define kernel_tex_image_interp(tex, x, y) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp(x, y) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp(x, y))
 #define kernel_tex_image_interp_3d(tex, x, y, z) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d(x, y, z) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d(x, y, z))
+#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d_ex(x, y, z, interpolation) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d_ex(x, y, z, interpolation))
 
 #define kernel_data (kg->__data)
 
+#ifdef __KERNEL_SSE2__ /* vector3 aliases over the SSE types and their print helpers. */
+typedef vector3<sseb> sse3b;
+typedef vector3<ssef> sse3f;
+typedef vector3<ssei> sse3i;
+
+ccl_device_inline void print_sse3b(const char *label, sse3b& a)
+{
+	print_sseb(label, a.x); /* x, y and z are printed in order, all under the same label. */
+	print_sseb(label, a.y);
+	print_sseb(label, a.z);
+}
+
+ccl_device_inline void print_sse3f(const char *label, sse3f& a)
+{
+	print_ssef(label, a.x); /* x, y and z are printed in order, all under the same label. */
+	print_ssef(label, a.y);
+	print_ssef(label, a.z);
+}
+
+ccl_device_inline void print_sse3i(const char *label, sse3i& a)
+{
+	print_ssei(label, a.x); /* x, y and z are printed in order, all under the same label. */
+	print_ssei(label, a.y);
+	print_ssei(label, a.z);
+}
+
+#endif /* __KERNEL_SSE2__ */
+
 CCL_NAMESPACE_END
 
 #endif /* __KERNEL_COMPAT_CPU_H__ */
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index e4c20d26ff1..9fdd3abfec3 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __KERNEL_COMPAT_CUDA_H__
@@ -22,6 +22,14 @@
 #define CCL_NAMESPACE_BEGIN
 #define CCL_NAMESPACE_END
 
+/* Selective nodes compilation. */
+#ifndef __NODES_MAX_GROUP__
+#  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
+#endif
+#ifndef __NODES_FEATURES__
+#  define __NODES_FEATURES__ NODE_FEATURE_ALL
+#endif
+
 #include <cuda.h>
 #include <float.h>
 
@@ -33,6 +41,7 @@
 #define ccl_global
 #define ccl_constant
 #define ccl_may_alias
+#define ccl_addr_space
 
 /* No assert supported for CUDA */
 
@@ -75,12 +84,11 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
 
 /* Use fast math functions */
 
-#define cosf(x) __cosf(((float)x))
-#define sinf(x) __sinf(((float)x))
-#define powf(x, y) __powf(((float)x), ((float)y))
-#define tanf(x) __tanf(((float)x))
-#define logf(x) __logf(((float)x))
-#define expf(x) __expf(((float)x))
+#define cosf(x) __cosf(((float)(x)))
+#define sinf(x) __sinf(((float)(x)))
+#define powf(x, y) __powf(((float)(x)), ((float)(y)))
+#define tanf(x) __tanf(((float)(x)))
+#define logf(x) __logf(((float)(x)))
+#define expf(x) __expf(((float)(x)))
 
 #endif /* __KERNEL_COMPAT_CUDA_H__ */
-
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index 9e58ebff599..e8b36d2605d 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __KERNEL_COMPAT_OPENCL_H__
@@ -24,14 +24,6 @@
 #define CCL_NAMESPACE_BEGIN
 #define CCL_NAMESPACE_END
 
-#ifdef __KERNEL_OPENCL_AMD__
-#define __CL_NO_FLOAT3__
-#endif
-
-#ifdef __CL_NO_FLOAT3__
-#define float3 float4
-#endif
-
 #ifdef __CL_NOINLINE__
 #define ccl_noinline __attribute__((noinline))
 #else
@@ -45,6 +37,22 @@
 #define ccl_may_alias
 #define ccl_constant __constant
 #define ccl_global __global
+#define ccl_local __local
+#define ccl_private __private
+
+#ifdef __SPLIT_KERNEL__
+#define ccl_addr_space __global
+#else
+#define ccl_addr_space
+#endif
+
+/* Selective nodes compilation. */
+#ifndef __NODES_MAX_GROUP__
+#  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
+#endif
+#ifndef __NODES_FEATURES__
+#  define __NODES_FEATURES__ NODE_FEATURE_ALL
+#endif
 
 /* no assert in opencl */
 #define kernel_assert(cond)
@@ -73,11 +81,7 @@
 #endif
 
 #define make_float2(x, y) ((float2)(x, y))
-#ifdef __CL_NO_FLOAT3__
-#define make_float3(x, y, z) ((float4)(x, y, z, 0.0f))
-#else
 #define make_float3(x, y, z) ((float3)(x, y, z))
-#endif
 #define make_float4(x, y, z, w) ((float4)(x, y, z, w))
 #define make_int2(x, y) ((int2)(x, y))
 #define make_int3(x, y, z) ((int3)(x, y, z))
@@ -89,34 +93,34 @@
 #define __float_as_uint(x) as_uint(x)
 #define __int_as_float(x) as_float(x)
 #define __float_as_int(x) as_int(x)
-#define powf(x, y) pow(((float)x), ((float)y))
-#define fabsf(x) fabs(((float)x))
-#define copysignf(x, y) copysign(((float)x), ((float)y))
-#define asinf(x) asin(((float)x))
-#define acosf(x) acos(((float)x))
-#define atanf(x) atan(((float)x))
-#define floorf(x) floor(((float)x))
-#define ceilf(x) ceil(((float)x))
-#define hypotf(x, y) hypot(((float)x), ((float)y))
-#define atan2f(x, y) atan2(((float)x), ((float)y))
-#define fmaxf(x, y) fmax(((float)x), ((float)y))
-#define fminf(x, y) fmin(((float)x), ((float)y))
-#define fmodf(x, y) fmod((float)x, (float)y)
+#define powf(x, y) pow(((float)(x)), ((float)(y)))
+#define fabsf(x) fabs(((float)(x)))
+#define copysignf(x, y) copysign(((float)(x)), ((float)(y)))
+#define asinf(x) asin(((float)(x)))
+#define acosf(x) acos(((float)(x)))
+#define atanf(x) atan(((float)(x)))
+#define floorf(x) floor(((float)(x)))
+#define ceilf(x) ceil(((float)(x)))
+#define hypotf(x, y) hypot(((float)(x)), ((float)(y)))
+#define atan2f(x, y) atan2(((float)(x)), ((float)(y)))
+#define fmaxf(x, y) fmax(((float)(x)), ((float)(y)))
+#define fminf(x, y) fmin(((float)(x)), ((float)(y)))
+#define fmodf(x, y) fmod((float)(x), (float)(y))
 
 #ifndef __CL_USE_NATIVE__
-#define sinf(x) native_sin(((float)x))
-#define cosf(x) native_cos(((float)x))
-#define tanf(x) native_tan(((float)x))
-#define expf(x) native_exp(((float)x))
-#define sqrtf(x) native_sqrt(((float)x))
-#define logf(x) native_log(((float)x))
+#define sinf(x) native_sin(((float)(x)))
+#define cosf(x) native_cos(((float)(x)))
+#define tanf(x) native_tan(((float)(x)))
+#define expf(x) native_exp(((float)(x)))
+#define sqrtf(x) native_sqrt(((float)(x)))
+#define logf(x) native_log(((float)(x)))
 #else
-#define sinf(x) sin(((float)x))
-#define cosf(x) cos(((float)x))
-#define tanf(x) tan(((float)x))
-#define expf(x) exp(((float)x))
-#define sqrtf(x) sqrt(((float)x))
-#define logf(x) log(((float)x))
+#define sinf(x) sin(((float)(x)))
+#define cosf(x) cos(((float)(x)))
+#define tanf(x) tan(((float)(x)))
+#define expf(x) exp(((float)(x)))
+#define sqrtf(x) sqrt(((float)(x)))
+#define logf(x) log(((float)(x)))
 #endif
 
 /* data lookup defines */
diff --git a/intern/cycles/kernel/kernel_debug.h b/intern/cycles/kernel/kernel_debug.h
new file mode 100644
index 00000000000..24d6458567e
--- /dev/null
+++ b/intern/cycles/kernel/kernel_debug.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void debug_data_init(DebugData *debug_data)
+{ /* Zero the three counters that kernel_write_debug_passes() later reads. */
+	debug_data->num_bvh_traversal_steps = 0;
+	debug_data->num_bvh_traversed_instances = 0;
+	debug_data->num_ray_bounces = 0;
+}
+
+ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
+                                                 ccl_global float *buffer,
+                                                 ccl_addr_space PathState *state, /* Not read in this function. */
+                                                 DebugData *debug_data,
+                                                 int sample)
+{ /* Write each debug counter to its pass offset in buffer, if that pass is enabled. */
+	int flag = kernel_data.film.pass_flag; /* Tested bit by bit against the PASS_* masks below. */
+	if(flag & PASS_BVH_TRAVERSAL_STEPS) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversal_steps,
+		                        sample,
+		                        debug_data->num_bvh_traversal_steps);
+	}
+	if(flag & PASS_BVH_TRAVERSED_INSTANCES) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances,
+		                        sample,
+		                        debug_data->num_bvh_traversed_instances);
+	}
+	if(flag & PASS_RAY_BOUNCES) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces,
+		                        sample,
+		                        debug_data->num_ray_bounces);
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_differential.h b/intern/cycles/kernel/kernel_differential.h
index daba2d927b7..ae1e70f0167 100644
--- a/intern/cycles/kernel/kernel_differential.h
+++ b/intern/cycles/kernel/kernel_differential.h
@@ -11,14 +11,14 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
 
 /* See "Tracing Ray Differentials", Homan Igehy, 1999. */
 
-ccl_device void differential_transfer(differential3 *dP_, const differential3 dP, float3 D, const differential3 dD, float3 Ng, float t)
+ccl_device void differential_transfer(ccl_addr_space differential3 *dP_, const differential3 dP, float3 D, const differential3 dD, float3 Ng, float t)
 {
 	/* ray differential transfer through homogeneous medium, to
 	 * compute dPdx/dy at a shading point from the incoming ray */
@@ -31,7 +31,7 @@ ccl_device void differential_transfer(differential3 *dP_, const differential3 dP
 	dP_->dy = tmpy - dot(tmpy, Ng)*tmp;
 }
 
-ccl_device void differential_incoming(differential3 *dI, const differential3 dD)
+ccl_device void differential_incoming(ccl_addr_space differential3 *dI, const differential3 dD)
 {
 	/* compute dIdx/dy at a shading point, we just need to negate the
 	 * differential of the ray direction */
@@ -40,7 +40,7 @@ ccl_device void differential_incoming(differential3 *dI, const differential3 dD)
 	dI->dy = -dD.dy;
 }
 
-ccl_device void differential_dudv(differential *du, differential *dv, float3 dPdu, float3 dPdv, differential3 dP, float3 Ng)
+ccl_device void differential_dudv(ccl_addr_space differential *du, ccl_addr_space differential *dv, float3 dPdu, float3 dPdv, differential3 dP, float3 Ng)
 {
 	/* now we have dPdx/dy from the ray differential transfer, and dPdu/dv
 	 * from the primitive, we can compute dudx/dy and dvdx/dy. these are
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index 4b2bb723ab6..de9e8d77ec8 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -11,18 +11,26 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
 
 /* Direction Emission */
-
 ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
-	LightSample *ls, float3 I, differential3 dI, float t, float time, int bounce, int transparent_bounce)
+	LightSample *ls, float3 I, differential3 dI, float t, float time, int bounce, int transparent_bounce
+#ifdef __SPLIT_KERNEL__
+	,ShaderData *sd_input
+#endif
+)
 {
 	/* setup shading at emitter */
-	ShaderData sd;
+#ifdef __SPLIT_KERNEL__
+	ShaderData *sd = sd_input;
+#else
+	ShaderData sd_object;
+	ShaderData *sd = &sd_object;
+#endif
 	float3 eval;
 
 #ifdef __BACKGROUND_MIS__
@@ -37,23 +45,23 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 		ray.dP = differential3_zero();
 		ray.dD = dI;
 
-		shader_setup_from_background(kg, &sd, &ray, bounce+1, transparent_bounce);
-		eval = shader_eval_background(kg, &sd, 0, SHADER_CONTEXT_EMISSION);
+		shader_setup_from_background(kg, sd, &ray, bounce+1, transparent_bounce);
+		eval = shader_eval_background(kg, sd, 0, SHADER_CONTEXT_EMISSION);
 	}
 	else
 #endif
 	{
-		shader_setup_from_sample(kg, &sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, bounce+1, transparent_bounce);
+		shader_setup_from_sample(kg, sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, bounce+1, transparent_bounce);
 
-		ls->Ng = sd.Ng;
+		ls->Ng = ccl_fetch(sd, Ng);
 
 		/* no path flag, we're evaluating this for all closures. that's weak but
 		 * we'd have to do multiple evaluations otherwise */
-		shader_eval_surface(kg, &sd, 0.0f, 0, SHADER_CONTEXT_EMISSION);
+		shader_eval_surface(kg, sd, 0.0f, 0, SHADER_CONTEXT_EMISSION);
 
 		/* evaluate emissive closure */
-		if(sd.flag & SD_EMISSION)
-			eval = shader_emissive_eval(kg, &sd);
+		if(ccl_fetch(sd, flag) & SD_EMISSION)
+			eval = shader_emissive_eval(kg, sd);
 		else
 			eval = make_float3(0.0f, 0.0f, 0.0f);
 	}
@@ -65,7 +73,11 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 
 ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 	LightSample *ls, Ray *ray, BsdfEval *eval, bool *is_lamp,
-	int bounce, int transparent_bounce)
+	int bounce, int transparent_bounce
+#ifdef __SPLIT_KERNEL__
+	, ShaderData *sd_DL
+#endif
+	)
 {
 	if(ls->pdf == 0.0f)
 		return false;
@@ -74,7 +86,14 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 	differential3 dD = differential3_zero();
 
 	/* evaluate closure */
-	float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, sd->time, bounce, transparent_bounce);
+
+	float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, ccl_fetch(sd, time),
+	                                         bounce,
+	                                         transparent_bounce
+#ifdef __SPLIT_KERNEL__
+	                                         ,sd_DL
+#endif
+	                                         );
 
 	if(is_zero(light_eval))
 		return false;
@@ -83,7 +102,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 	float bsdf_pdf;
 
 #ifdef __VOLUME__
-	if(sd->prim != PRIM_NONE)
+	if(ccl_fetch(sd, prim) != PRIM_NONE)
 		shader_bsdf_eval(kg, sd, ls->D, eval, &bsdf_pdf);
 	else
 		shader_volume_phase_eval(kg, sd, ls->D, eval, &bsdf_pdf);
@@ -118,8 +137,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 
 	if(ls->shader & SHADER_CAST_SHADOW) {
 		/* setup ray */
-		bool transmit = (dot(sd->Ng, ls->D) < 0.0f);
-		ray->P = ray_offset(sd->P, (transmit)? -sd->Ng: sd->Ng);
+		bool transmit = (dot(ccl_fetch(sd, Ng), ls->D) < 0.0f);
+		ray->P = ray_offset(ccl_fetch(sd, P), (transmit)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
 
 		if(ls->t == FLT_MAX) {
 			/* distant light */
@@ -132,7 +151,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 			ray->D = normalize_len(ray->D, &ray->t);
 		}
 
-		ray->dP = sd->dP;
+		ray->dP = ccl_fetch(sd, dP);
 		ray->dD = differential3_zero();
 	}
 	else {
@@ -154,14 +173,14 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
 	float3 L = shader_emissive_eval(kg, sd);
 
 #ifdef __HAIR__
-	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE))
+	if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS) && (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE))
 #else
-	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
+	if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS))
 #endif
 	{
 		/* multiple importance sampling, get triangle light pdf,
 		 * and compute weight with respect to BSDF pdf */
-		float pdf = triangle_light_pdf(kg, sd->Ng, sd->I, t);
+		float pdf = triangle_light_pdf(kg, ccl_fetch(sd, Ng), ccl_fetch(sd, I), t);
 		float mis_weight = power_heuristic(bsdf_pdf, pdf);
 
 		return L*mis_weight;
@@ -172,7 +191,11 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
 
 /* Indirect Lamp Emission */
 
-ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *state, Ray *ray, float3 *emission)
+ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *state, Ray *ray, float3 *emission
+#ifdef __SPLIT_KERNEL__
+                                                ,ShaderData *sd
+#endif
+                                                )
 {
 	bool hit_lamp = false;
 
@@ -188,14 +211,21 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st
 		/* use visibility flag to skip lights */
 		if(ls.shader & SHADER_EXCLUDE_ANY) {
 			if(((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
-			   ((ls.shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & PATH_RAY_GLOSSY)) ||
+			   ((ls.shader & SHADER_EXCLUDE_GLOSSY) &&
+			    ((state->flag & (PATH_RAY_GLOSSY|PATH_RAY_REFLECT)) == (PATH_RAY_GLOSSY|PATH_RAY_REFLECT))) ||
 			   ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
 			   ((ls.shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
 				continue;
 		}
 #endif
 
-		float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time, state->bounce, state->transparent_bounce);
+		float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time,
+		                                state->bounce,
+		                                state->transparent_bounce
+#ifdef __SPLIT_KERNEL__
+		                                ,sd
+#endif
+		                                );
 
 #ifdef __VOLUME__
 		if(state->volume_stack[0].shader != SHADER_NONE) {
@@ -224,7 +254,11 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st
 
 /* Indirect Background */
 
-ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *state, Ray *ray)
+ccl_device_noinline float3 indirect_background(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space Ray *ray
+#ifdef __SPLIT_KERNEL__
+                                               ,ShaderData *sd_global
+#endif
+                                               )
 {
 #ifdef __BACKGROUND__
 	int shader = kernel_data.background.surface_shader;
@@ -232,18 +266,25 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *sta
 	/* use visibility flag to skip lights */
 	if(shader & SHADER_EXCLUDE_ANY) {
 		if(((shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
-		   ((shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & PATH_RAY_GLOSSY)) ||
+		   ((shader & SHADER_EXCLUDE_GLOSSY) &&
+		    ((state->flag & (PATH_RAY_GLOSSY|PATH_RAY_REFLECT)) == (PATH_RAY_GLOSSY|PATH_RAY_REFLECT))) ||
 		   ((shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
 		   ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA)) ||
 		   ((shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
 			return make_float3(0.0f, 0.0f, 0.0f);
 	}
 
+#ifdef __SPLIT_KERNEL__
 	/* evaluate background closure */
+	Ray priv_ray = *ray;
+	shader_setup_from_background(kg, sd_global, &priv_ray, state->bounce+1, state->transparent_bounce);
+	float3 L = shader_eval_background(kg, sd_global, state->flag, SHADER_CONTEXT_EMISSION);
+#else
 	ShaderData sd;
 	shader_setup_from_background(kg, &sd, ray, state->bounce+1, state->transparent_bounce);
 
 	float3 L = shader_eval_background(kg, &sd, state->flag, SHADER_CONTEXT_EMISSION);
+#endif
 
 #ifdef __BACKGROUND_MIS__
 	/* check if background light exists or if we should skip pdf */
@@ -252,7 +293,7 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *sta
 	if(!(state->flag & PATH_RAY_MIS_SKIP) && res) {
 		/* multiple importance sampling, get background light pdf for ray
 		 * direction, and compute weight with respect to BSDF pdf */
-		float pdf = background_light_pdf(kg, ray->D);
+		float pdf = background_light_pdf(kg, ray->P, ray->D);
 		float mis_weight = power_heuristic(state->ray_pdf, pdf);
 
 		return L*mis_weight;
diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h
index dc5f6e7ce38..f9e9b413898 100644
--- a/intern/cycles/kernel/kernel_film.h
+++ b/intern/cycles/kernel/kernel_film.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -27,7 +27,7 @@ ccl_device float4 film_map(KernelGlobals *kg, float4 irradiance, float scale)
 	result.z = color_scene_linear_to_srgb(result.z*exposure);
 
 	/* clamp since alpha might be > 1.0 due to russian roulette */
-	result.w = clamp(result.w, 0.0f, 1.0f);
+	result.w = saturate(result.w);
 
 	return result;
 }
@@ -37,10 +37,10 @@ ccl_device uchar4 film_float_to_byte(float4 color)
 	uchar4 result;
 
 	/* simple float to byte conversion */
-	result.x = (uchar)clamp(color.x*255.0f, 0.0f, 255.0f);
-	result.y = (uchar)clamp(color.y*255.0f, 0.0f, 255.0f);
-	result.z = (uchar)clamp(color.z*255.0f, 0.0f, 255.0f);
-	result.w = (uchar)clamp(color.w*255.0f, 0.0f, 255.0f);
+	result.x = (uchar)(saturate(color.x)*255.0f);
+	result.y = (uchar)(saturate(color.y)*255.0f);
+	result.z = (uchar)(saturate(color.z)*255.0f);
+	result.w = (uchar)(saturate(color.w)*255.0f);
 
 	return result;
 }
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index 6bd2ec0662c..17fa18909c4 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* Constant Globals */
@@ -80,7 +80,7 @@ typedef struct KernelGlobals {} KernelGlobals;
 
 #ifdef __KERNEL_OPENCL__
 
-typedef struct KernelGlobals {
+typedef ccl_addr_space struct KernelGlobals {
 	ccl_constant KernelData *data;
 
 #define KERNEL_TEX(type, ttype, name) \
@@ -94,7 +94,7 @@ typedef struct KernelGlobals {
 
 ccl_device float lookup_table_read(KernelGlobals *kg, float x, int offset, int size)
 {
-	x = clamp(x, 0.0f, 1.0f)*(size-1);
+	x = saturate(x)*(size-1);
 
 	int index = min(float_to_int(x), size-1);
 	int nindex = min(index+1, size-1);
@@ -110,7 +110,7 @@ ccl_device float lookup_table_read(KernelGlobals *kg, float x, int offset, int s
 
 ccl_device float lookup_table_read_2D(KernelGlobals *kg, float x, float y, int offset, int xsize, int ysize)
 {
-	y = clamp(y, 0.0f, 1.0f)*(ysize-1);
+	y = saturate(y)*(ysize-1);
 
 	int index = min(float_to_int(y), ysize-1);
 	int nindex = min(index+1, ysize-1);
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index 2a5b7689e57..9ba41635b9e 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* TODO(sergey): Consider moving portable ctz/clz stuff to util. */
@@ -47,6 +47,8 @@ ccl_device_inline int cmj_fast_div_pow2(int a, int b)
 #  else
 	return a >> __builtin_ctz(b);
 #  endif
+#elif defined(__KERNEL_CUDA__)
+	return a >> (__ffs(b) - 1);
 #else
 	return a/b;
 #endif
@@ -63,6 +65,8 @@ ccl_device_inline uint cmj_w_mask(uint w)
 #  else
 	return ((1 << (32 - __builtin_clz(w))) - 1);
 #  endif
+#elif defined(__KERNEL_CUDA__)
+	return ((1 << (32 - __clz(w))) - 1);
 #else
 	w |= w >> 1;
 	w |= w >> 2;
@@ -124,7 +128,7 @@ ccl_device_inline uint cmj_permute(uint i, uint l, uint p)
 			i *= 0xc860a3df;
 			i &= w;
 			i ^= i >> 5;
-		} while (i >= l);
+		} while(i >= l);
 
 		return (i + p) % l;
 	}
@@ -167,7 +171,11 @@ ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
 {
 	kernel_assert(s < N);
 
+#if defined(__KERNEL_CUDA__)
+	int m = float_to_int(__fsqrt_ru(N));
+#else
 	int m = float_to_int(sqrtf(N));
+#endif
 	int n = (N + m - 1)/m;
 	float invN = 1.0f/N;
 	float invm = 1.0f/m;
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index e7f62f230f8..7590ec2d706 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -33,11 +33,112 @@ typedef struct LightSample {
 	LightType type;		/* type of light */
 } LightSample;
 
+/* Area light sampling */
+
+/* Sample a rectangular area light by solid angle as seen from P, using
+ * Carlos Urena et al., An Area-Preserving Parametrization for Spherical
+ * Rectangles.
+ * https://www.solidangle.com/research/egsr2013_spherical_rectangle.pdf
+ *
+ * light_p is the rectangle center and axisu/axisv are its full edge vectors.
+ * Returns 1/S for the subtended solid angle S, or 0 when S is zero.
+ * Note: light_p is overwritten with the sampled point when sample_coord is true.
+ */
+ccl_device float area_light_sample(float3 P,
+                                   float3 *light_p,
+                                   float3 axisu, float3 axisv,
+                                   float randu, float randv,
+                                   bool sample_coord)
+{
+	/* In our name system we're using P for the center,
+	 * which is o in the paper.
+	 */
+
+	float3 corner = *light_p - axisu * 0.5f - axisv * 0.5f;
+	float axisu_len, axisv_len;
+	/* Compute local reference system R. */
+	float3 x = normalize_len(axisu, &axisu_len);
+	float3 y = normalize_len(axisv, &axisv_len);
+	float3 z = cross(x, y);
+	/* Compute rectangle coords in local reference system. */
+	float3 dir = corner - P;
+	float z0 = dot(dir, z);
+	/* Flip 'z' to make it point against Q. */
+	if(z0 > 0.0f) {
+		z *= -1.0f;
+		z0 *= -1.0f;
+	}
+	float x0 = dot(dir, x);
+	float y0 = dot(dir, y);
+	float x1 = x0 + axisu_len;
+	float y1 = y0 + axisv_len;
+	/* Create vectors to four vertices. */
+	float3 v00 = make_float3(x0, y0, z0);
+	float3 v01 = make_float3(x0, y1, z0);
+	float3 v10 = make_float3(x1, y0, z0);
+	float3 v11 = make_float3(x1, y1, z0);
+	/* Compute normals to edges. */
+	float3 n0 = normalize(cross(v00, v10));
+	float3 n1 = normalize(cross(v10, v11));
+	float3 n2 = normalize(cross(v11, v01));
+	float3 n3 = normalize(cross(v01, v00));
+	/* Compute internal angles (gamma_i). */
+	float g0 = safe_acosf(-dot(n0, n1));
+	float g1 = safe_acosf(-dot(n1, n2));
+	float g2 = safe_acosf(-dot(n2, n3));
+	float g3 = safe_acosf(-dot(n3, n0));
+	/* Compute predefined constants. */
+	float b0 = n0.z;
+	float b1 = n2.z;
+	float b0sq = b0 * b0;
+	float k = M_2PI_F - g2 - g3;
+	/* Compute solid angle from internal angles. */
+	float S = g0 + g1 - k;
+
+	if(sample_coord) {
+		/* Compute cu. */
+		float au = randu * S + k;
+		float fu = (cosf(au) * b0 - b1) / sinf(au);
+		float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
+		cu = clamp(cu, -1.0f, 1.0f);
+		/* Compute xu; floor the divisor, it is exactly 0 once cu is clamped to +-1. */
+		float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f);
+		xu = clamp(xu, x0, x1);
+		/* Compute yv. */
+		float z0sq = z0 * z0;
+		float y0sq = y0 * y0;
+		float y1sq = y1 * y1;
+		float d = sqrtf(xu * xu + z0sq);
+		float h0 = y0 / sqrtf(d * d + y0sq);
+		float h1 = y1 / sqrtf(d * d + y1sq);
+		float hv = h0 + randv * (h1 - h0), hv2 = hv * hv;
+		float yv = (hv2 < 1.0f - 1e-6f) ? (hv * d) / sqrtf(1.0f - hv2) : y1;
+
+		/* Transform (xu, yv, z0) to world coords. */
+		*light_p = P + xu * x + yv * y + z0 * z;
+	}
+
+	/* return pdf */
+	if(S != 0.0f)
+		return 1.0f / S;
+	else
+		return 0.0f;
+}
+
 /* Background Light */
 
 #ifdef __BACKGROUND_MIS__
 
-ccl_device float3 background_light_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
+/* TODO(sergey): In theory it should be all fine to use noinline for all
+ * devices, but we're so close to the release so better not screw things
+ * up for CPU at least.
+ */
+#ifdef __KERNEL_GPU__
+ccl_device_noinline
+#else
+ccl_device
+#endif
+float3 background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
 {
 	/* for the following, the CDF values are actually a pair of floats, with the
 	 * function value as X and the actual CDF as Y.  The last entry's function
@@ -107,13 +208,19 @@ ccl_device float3 background_light_sample(KernelGlobals *kg, float randu, float
 	else
 		*pdf = (cdf_u.x * cdf_v.x)/(M_2PI_F * M_PI_F * sin_theta * denom);
 
-	*pdf *= kernel_data.integrator.pdf_lights;
-
 	/* compute direction */
-	return -equirectangular_to_direction(u, v);
+	return equirectangular_to_direction(u, v);
 }
 
-ccl_device float background_light_pdf(KernelGlobals *kg, float3 direction)
+/* TODO(sergey): Same as above, after the release we should consider using
+ * 'noinline' for all devices.
+ */
+#ifdef __KERNEL_GPU__
+ccl_device_noinline
+#else
+ccl_device
+#endif
+float background_map_pdf(KernelGlobals *kg, float3 direction)
 {
 	float2 uv = direction_to_equirectangular(direction);
 	int res = kernel_data.integrator.pdf_background_res;
@@ -139,9 +246,223 @@ ccl_device float background_light_pdf(KernelGlobals *kg, float3 direction)
 	float2 cdf_u = kernel_tex_fetch(__light_background_conditional_cdf, index_v * (res + 1) + index_u);
 	float2 cdf_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v);
 
-	float pdf = (cdf_u.x * cdf_v.x)/(M_2PI_F * M_PI_F * sin_theta * denom);
+	return (cdf_u.x * cdf_v.x)/(M_2PI_F * M_PI_F * sin_theta * denom);
+}
+
+ccl_device_inline bool background_portal_data_fetch_and_check_side(KernelGlobals *kg,
+                                                                   float3 P,
+                                                                   int index,
+                                                                   float3 *lightpos,
+                                                                   float3 *dir)
+{
+	/* Portals are stored in __light_data starting at portal_offset;
+	 * row 0 carries the position and row 3 the direction in .yzw.
+	 */
+	const int row = (index + kernel_data.integrator.portal_offset)*LIGHT_SIZE;
+	const float4 pos_data = kernel_tex_fetch(__light_data, row + 0);
+	const float4 dir_data = kernel_tex_fetch(__light_data, row + 3);
+	*lightpos = make_float3(pos_data.y, pos_data.z, pos_data.w);
+	*dir = make_float3(dir_data.y, dir_data.z, dir_data.w);
+
+	/* The portal can be used only when P lies on the side its direction points to. */
+	return (dot(*dir, P - *lightpos) > 1e-5f);
+}
+
+ccl_device float background_portal_pdf(KernelGlobals *kg,
+                                       float3 P,
+                                       float3 direction,
+                                       int ignore_portal,
+                                       bool *is_possible)
+{
+	/* Solid angle pdf of reaching 'direction' from P through any portal except
+	 * ignore_portal, normalized by the number of portals sampleable from P.
+	 * A non-negative ignore_portal is assumed sampleable, so it is counted too.
+	 */
+	float portal_pdf = 0.0f;
+	int num_possible = (ignore_portal >= 0)? 1: 0;
+	for(int p = 0; p < kernel_data.integrator.num_portals; p++) {
+		if(p == ignore_portal)
+			continue;
+
+		float3 lightpos, dir;
+		if(!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
+			continue;
+
+		/* There's a portal that could be sampled from this position. */
+		num_possible++;
+		if(is_possible)
+			*is_possible = true;
+
+		float t = -(dot(P, dir) - dot(lightpos, dir)) / dot(direction, dir);
+		if(t <= 1e-5f)
+			continue;  /* Either behind the portal or too close. */
+
+		float4 data1 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 1);
+		float4 data2 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 2);
+		float3 axisu = make_float3(data1.y, data1.z, data1.w);
+		float3 axisv = make_float3(data2.y, data2.z, data2.w);
+
+		/* Skip if the ray doesn't pass through the portal. */
+		float3 inplane = (P + t*direction) - lightpos;
+		if(fabsf(dot(inplane, axisu) / dot(axisu, axisu)) > 0.5f)
+			continue;
+		if(fabsf(dot(inplane, axisv) / dot(axisv, axisv)) > 0.5f)
+			continue;
+
+		portal_pdf += area_light_sample(P, &lightpos, axisu, axisv, 0.0f, 0.0f, false);
+	}
 
-	return pdf * kernel_data.integrator.pdf_lights;
+	return (num_possible > 0)? portal_pdf / num_possible: 0.0f;
+}
+
+ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P)
+{
+	/* Count the portals that pass the side check for P. */
+	float3 lightpos, dir;
+	int count = 0;
+	for(int p = 0; p < kernel_data.integrator.num_portals; p++)
+		count += background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir)? 1: 0;
+
+	return count;
+}
+
+ccl_device float3 background_portal_sample(KernelGlobals *kg,
+                                           float3 P,
+                                           float randu,
+                                           float randv,
+                                           int num_possible,
+                                           int *sampled_portal,
+                                           float *pdf)
+{
+	/* Pick one of the num_possible sampleable portals uniformly, then
+	 * re-normalize randv. The index is clamped because randv*num_possible
+	 * reaches num_possible when randv is 1.0, and the search below would
+	 * then fall through leaving *pdf and *sampled_portal unset.
+	 */
+	randv *= num_possible;
+	int portal = min((int)randv, num_possible - 1);
+	randv -= portal;
+
+	/* TODO(sergey): Some smarter way of finding portal to sample
+	 * is welcome.
+	 */
+	for(int p = 0; p < kernel_data.integrator.num_portals; p++) {
+		/* Search for the sampled portal. */
+		float3 lightpos, dir;
+		if(!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
+			continue;
+
+		if(portal == 0) {
+			/* p is the portal to be sampled. */
+			float4 data1 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 1);
+			float4 data2 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 2);
+			float3 axisu = make_float3(data1.y, data1.z, data1.w);
+			float3 axisv = make_float3(data2.y, data2.z, data2.w);
+
+			*pdf = area_light_sample(P, &lightpos, axisu, axisv, randu, randv, true);
+			*pdf /= num_possible;
+			*sampled_portal = p;
+			return normalize(lightpos - P);
+		}
+
+		portal--;
+	}
+
+	return make_float3(0.0f, 0.0f, 0.0f);
+}
+
+ccl_device float3 background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf)
+{
+	/* Sample a background direction as seen from P via portals or the map; the sampling pdf is written to *pdf. */
+	float portal_sampling_pdf = kernel_data.integrator.portal_pdf;  /* probability of sampling portals instead of the map */
+
+	/* Check if there are portals in the scene which we can sample. */
+	if(portal_sampling_pdf > 0.0f) {
+		int num_portals = background_num_possible_portals(kg, P);
+		if(num_portals > 0) {
+			if(portal_sampling_pdf == 1.0f || randu < portal_sampling_pdf) {
+				if(portal_sampling_pdf < 1.0f) {
+					randu /= portal_sampling_pdf;  /* rescale randu by the selection probability so it can be reused */
+				}
+				int portal;  /* NOTE(review): stays unset if background_portal_sample finds no portal */
+				float3 D = background_portal_sample(kg, P, randu, randv, num_portals, &portal, pdf);
+				if(num_portals > 1) {
+					/* Ignore the chosen portal, its pdf is already included. */
+					*pdf += background_portal_pdf(kg, P, D, portal, NULL);
+				}
+				/* We could also have sampled the map, so combine with MIS. */
+				if(portal_sampling_pdf < 1.0f) {
+					float cdf_pdf = background_map_pdf(kg, D);
+					*pdf = (portal_sampling_pdf * (*pdf)
+					     + (1.0f - portal_sampling_pdf) * cdf_pdf);
+				}
+				return D;
+			} else {
+				/* Sample map, but with nonzero portal_sampling_pdf for MIS. */
+				randu = (randu - portal_sampling_pdf) / (1.0f - portal_sampling_pdf);
+			}
+		} else {
+			/* We can't sample a portal.
+			 * Check if we can sample the map instead.
+			 */
+			if(portal_sampling_pdf == 1.0f) {
+				/* Use uniform as a fallback if we can't sample the map. */
+				*pdf = 1.0f / M_4PI_F;
+				return sample_uniform_sphere(randu, randv);
+			}
+			else {
+				portal_sampling_pdf = 0.0f;  /* no portal passes the side check for P, so the MIS step below is skipped */
+			}
+		}
+	}
+
+	float3 D = background_map_sample(kg, randu, randv, pdf);
+	/* Use MIS if portals could be sampled as well. */
+	if(portal_sampling_pdf > 0.0f) {
+		float portal_pdf = background_portal_pdf(kg, P, D, -1, NULL);
+		*pdf = (portal_sampling_pdf * portal_pdf
+		     + (1.0f - portal_sampling_pdf) * (*pdf));
+	}
+	return D;
+}
+
+ccl_device float background_light_pdf(KernelGlobals *kg, float3 P, float3 direction)
+{
+	/* Pdf of 'direction' as seen from P, combining the portal and map pdfs; non-zero results are scaled by pdf_lights. */
+	float portal_sampling_pdf = kernel_data.integrator.portal_pdf;  /* probability of sampling portals instead of the map */
+
+	if(portal_sampling_pdf > 0.0f) {
+		bool is_possible = false;  /* set by background_portal_pdf when some portal passes the side check for P */
+		float portal_pdf = background_portal_pdf(kg, P, direction, -1, &is_possible);
+		if(portal_pdf == 0.0f) {
+			if(portal_sampling_pdf == 1.0f) {
+				/* If there are no possible portals at this point,
+				 * the fallback sampling would have been used.
+				 * Otherwise, the direction would not be sampled at all => pdf = 0
+				 */
+				return is_possible? 0.0f: kernel_data.integrator.pdf_lights / M_4PI_F;
+			}
+			else {
+				/* We can only sample the map. */
+				return background_map_pdf(kg, direction) * kernel_data.integrator.pdf_lights;
+			}
+		} else {
+			if(portal_sampling_pdf == 1.0f) {
+				/* We can only sample portals. */
+				return portal_pdf * kernel_data.integrator.pdf_lights;
+			}
+			else {
+				/* We can sample both, so combine with MIS. */
+				return (background_map_pdf(kg, direction) * (1.0f - portal_sampling_pdf)
+				      + portal_pdf * portal_sampling_pdf) * kernel_data.integrator.pdf_lights;
+			}
+		}
+	}
+
+	/* No portals in the scene, so must sample the map.
+	 * At least one of them is always possible if we have a LIGHT_BACKGROUND.
+	 */
+	return background_map_pdf(kg, direction) * kernel_data.integrator.pdf_lights;
+}
 #endif
 
@@ -167,14 +488,6 @@ ccl_device float3 sphere_light_sample(float3 P, float3 center, float radius, flo
 	return disk_light_sample(normalize(P - center), randu, randv)*radius;
 }
 
-ccl_device float3 area_light_sample(float3 axisu, float3 axisv, float randu, float randv)
-{
-	randu = randu - 0.5f;
-	randv = randv - 0.5f;
-
-	return axisu*randu + axisv*randv;
-}
-
 ccl_device float spot_light_attenuation(float4 data1, float4 data2, LightSample *ls)
 {
 	float3 dir = make_float3(data2.y, data2.z, data2.w);
@@ -245,13 +558,14 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
 #ifdef __BACKGROUND_MIS__
 	else if(type == LIGHT_BACKGROUND) {
 		/* infinite area light (e.g. light dome or env light) */
-		float3 D = background_light_sample(kg, randu, randv, &ls->pdf);
+		float3 D = -background_light_sample(kg, P, randu, randv, &ls->pdf);
 
 		ls->P = D;
 		ls->Ng = D;
 		ls->D = -D;
 		ls->t = FLT_MAX;
 		ls->eval_fac = 1.0f;
+		ls->pdf *= kernel_data.integrator.pdf_lights;
 	}
 #endif
 	else {
@@ -276,6 +590,7 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
 				float4 data2 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 2);
 				ls->eval_fac *= spot_light_attenuation(data1, data2, ls);
 			}
+			ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 		}
 		else {
 			/* area light */
@@ -286,22 +601,31 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
 			float3 axisv = make_float3(data2.y, data2.z, data2.w);
 			float3 D = make_float3(data3.y, data3.z, data3.w);
 
-			ls->P += area_light_sample(axisu, axisv, randu, randv);
+			ls->pdf = area_light_sample(P, &ls->P,
+			                          axisu, axisv,
+			                          randu, randv,
+			                          true);
+
 			ls->Ng = D;
 			ls->D = normalize_len(ls->P - P, &ls->t);
 
 			float invarea = data2.x;
-
 			ls->eval_fac = 0.25f*invarea;
-			ls->pdf = invarea;
+
+			if(dot(ls->D, D) > 0.0f)
+				ls->pdf = 0.0f;
 		}
 
 		ls->eval_fac *= kernel_data.integrator.inv_pdf_lights;
-		ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 	}
 }
 
-ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, float t, LightSample *ls)
+#if defined(__KERNEL_CUDA__) && (__CUDA_ARCH__ >= 500) && (defined(i386) || defined(_M_IX86))
+ccl_device_noinline
+#else
+ccl_device
+#endif
+bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, float t, LightSample *ls)
 {
 	float4 data0 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 0);
 	float4 data1 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 1);
@@ -355,6 +679,7 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 		ls->D = D;
 		ls->t = FLT_MAX;
 
+		/* compute pdf */
 		float invarea = data1.w;
 		ls->pdf = invarea/(costheta*costheta*costheta);
 		ls->eval_fac = ls->pdf;
@@ -386,6 +711,10 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 			if(ls->eval_fac == 0.0f)
 				return false;
 		}
+
+		/* compute pdf */
+		if(ls->t != FLT_MAX)
+			ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 	}
 	else if(type == LIGHT_AREA) {
 		/* area light */
@@ -404,24 +733,20 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 		if(dot(D, Ng) >= 0.0f)
 			return false;
 
-		ls->P = make_float3(data0.y, data0.z, data0.w);
+		float3 light_P = make_float3(data0.y, data0.z, data0.w);
 
 		if(!ray_quad_intersect(P, D, t,
-			ls->P, axisu, axisv, &ls->P, &ls->t))
+			light_P, axisu, axisv, &ls->P, &ls->t))
 			return false;
 
 		ls->D = D;
 		ls->Ng = Ng;
-		ls->pdf = invarea;
-		ls->eval_fac = 0.25f*ls->pdf;
+		ls->pdf = area_light_sample(P, &light_P, axisu, axisv, 0, 0, false);
+		ls->eval_fac = 0.25f*invarea;
 	}
 	else
 		return false;
 
-	/* compute pdf */
-	if(ls->t != FLT_MAX)
-		ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
-
 	return true;
 }
 
@@ -514,7 +839,13 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float randt)
 
 /* Generic Light */
 
-ccl_device void light_sample(KernelGlobals *kg, float randt, float randu, float randv, float time, float3 P, LightSample *ls)
+ccl_device bool light_select_reached_max_bounces(KernelGlobals *kg, int index, int bounce)
+{
+	const float4 bounce_data = kernel_tex_fetch(__light_data, index*LIGHT_SIZE + 4);  /* fifth data row of this light */
+	return (__float_as_int(bounce_data.x) < bounce);  /* .x holds an int stored as float bits */
+}
+
+ccl_device void light_sample(KernelGlobals *kg, float randt, float randu, float randv, float time, float3 P, int bounce, LightSample *ls)
 {
 	/* sample index */
 	int index = light_distribution_sample(kg, randt);
@@ -536,6 +867,12 @@ ccl_device void light_sample(KernelGlobals *kg, float randt, float randu, float
 	}
 	else {
 		int lamp = -prim-1;
+
+		if(UNLIKELY(light_select_reached_max_bounces(kg, lamp, bounce))) {
+			ls->pdf = 0.0f;
+			return;
+		}
+
 		lamp_light_sample(kg, lamp, randu, randv, P, ls);
 	}
 }
@@ -546,22 +883,5 @@ ccl_device int light_select_num_samples(KernelGlobals *kg, int index)
 	return __float_as_int(data3.x);
 }
 
-ccl_device int lamp_light_eval_sample(KernelGlobals *kg, float randt)
-{
-	/* sample index */
-	int index = light_distribution_sample(kg, randt);
-
-	/* fetch light data */
-	float4 l = kernel_tex_fetch(__light_distribution, index);
-	int prim = __float_as_int(l.y);
-
-	if(prim < 0) {
-		int lamp = -prim-1;
-		return lamp;
-	}
-	else
-		return LAMP_NONE;
-}
-
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h
index d95a5c76309..9e14d8cc7cb 100644
--- a/intern/cycles/kernel/kernel_math.h
+++ b/intern/cycles/kernel/kernel_math.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __KERNEL_MATH_H__
@@ -19,6 +19,7 @@
 
 #include "util_color.h"
 #include "util_math.h"
+#include "util_math_fast.h"
 #include "util_transform.h"
 #include "util_distort.h"
 
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index b3b6fc02894..20cf3fa931b 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -19,23 +19,49 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value)
 {
 	ccl_global float *buf = buffer;
+#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+	atomic_add_float(buf, value);
+#else
 	*buf = (sample == 0)? value: *buf + value;
+#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
 }
 
 ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value)
 {
+#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+	ccl_global float *buf_x = buffer + 0;
+	ccl_global float *buf_y = buffer + 1;
+	ccl_global float *buf_z = buffer + 2;
+
+	atomic_add_float(buf_x, value.x);
+	atomic_add_float(buf_y, value.y);
+	atomic_add_float(buf_z, value.z);
+#else
 	ccl_global float3 *buf = (ccl_global float3*)buffer;
 	*buf = (sample == 0)? value: *buf + value;
+#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
 }
 
 ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value)
 {
+#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+	ccl_global float *buf_x = buffer + 0;
+	ccl_global float *buf_y = buffer + 1;
+	ccl_global float *buf_z = buffer + 2;
+	ccl_global float *buf_w = buffer + 3;
+
+	atomic_add_float(buf_x, value.x);
+	atomic_add_float(buf_y, value.y);
+	atomic_add_float(buf_z, value.z);
+	atomic_add_float(buf_w, value.w);
+#else
 	ccl_global float4 *buf = (ccl_global float4*)buffer;
 	*buf = (sample == 0)? value: *buf + value;
+#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
 }
 
 ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
-	ShaderData *sd, int sample, PathState *state, float3 throughput)
+	ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
 {
 #ifdef __PASSES__
 	int path_flag = state->flag;
@@ -49,18 +75,18 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 		return;
 	
 	if(!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) {
-		if(!(sd->flag & SD_TRANSPARENT) ||
+		if(!(ccl_fetch(sd, flag) & SD_TRANSPARENT) ||
 		   kernel_data.film.pass_alpha_threshold == 0.0f ||
 		   average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold)
 		{
 
 			if(sample == 0) {
 				if(flag & PASS_DEPTH) {
-					float depth = camera_distance(kg, sd->P);
+					float depth = camera_distance(kg, ccl_fetch(sd, P));
 					kernel_write_pass_float(buffer + kernel_data.film.pass_depth, sample, depth);
 				}
 				if(flag & PASS_OBJECT_ID) {
-					float id = object_pass_id(kg, sd->object);
+					float id = object_pass_id(kg, ccl_fetch(sd, object));
 					kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, sample, id);
 				}
 				if(flag & PASS_MATERIAL_ID) {
@@ -70,7 +96,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 			}
 
 			if(flag & PASS_NORMAL) {
-				float3 normal = sd->N;
+				float3 normal = ccl_fetch(sd, N);
 				kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal);
 			}
 			if(flag & PASS_UV) {
@@ -101,8 +127,8 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 		float mist_start = kernel_data.film.mist_start;
 		float mist_inv_depth = kernel_data.film.mist_inv_depth;
 
-		float depth = camera_distance(kg, sd->P);
-		float mist = clamp((depth - mist_start)*mist_inv_depth, 0.0f, 1.0f);
+		float depth = camera_distance(kg, ccl_fetch(sd, P));
+		float mist = saturate((depth - mist_start)*mist_inv_depth);
 
 		/* falloff */
 		float mist_falloff = kernel_data.film.mist_falloff;
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index b8994d940fd..9794ad1d180 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifdef __OSL__
@@ -42,9 +42,14 @@
 #include "kernel_path_state.h"
 #include "kernel_shadow.h"
 #include "kernel_emission.h"
+#include "kernel_path_common.h"
 #include "kernel_path_surface.h"
 #include "kernel_path_volume.h"
 
+#ifdef __KERNEL_DEBUG__
+#include "kernel_debug.h"
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
@@ -113,7 +118,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 
 					/* direct light sampling */
 					kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
-						throughput, &state, L, 1.0f, all, &volume_ray, &volume_segment);
+						throughput, &state, L, all, &volume_ray, &volume_segment);
 
 					/* indirect sample. if we use distance sampling and take just
 					 * one sample for direct and indirect light, we could share
@@ -126,9 +131,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 						rphase, rscatter, &volume_segment, NULL, true);
 				}
 
-				if(result != VOLUME_PATH_SCATTERED)
-					throughput *= volume_segment.accum_transmittance;
-
 				/* free cached steps */
 				kernel_volume_decoupled_free(kg, &volume_segment);
 
@@ -138,6 +140,9 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 					else
 						break;
 				}
+				else {
+					throughput *= volume_segment.accum_transmittance;
+				}
 			}
 			else
 #endif
@@ -269,8 +274,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 				float bssrdf_u, bssrdf_v;
 				path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
 				subsurface_scatter_step(kg, &sd, state.flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
-
-				state.flag |= PATH_RAY_BSSRDF_ANCESTOR;
 			}
 		}
 #endif
@@ -303,17 +306,17 @@ ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *
 
 	sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
 
-	if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
+	if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
 		Ray light_ray;
 		float3 ao_shadow;
 
-		light_ray.P = ray_offset(sd->P, sd->Ng);
+		light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
 		light_ray.D = ao_D;
 		light_ray.t = kernel_data.background.ao_distance;
 #ifdef __OBJECT_MOTION__
-		light_ray.time = sd->time;
+		light_ray.time = ccl_fetch(sd, time);
 #endif
-		light_ray.dP = sd->dP;
+		light_ray.dP = ccl_fetch(sd, dP);
 		light_ray.dD = differential3_zero();
 
 		if(!shadow_blocked(kg, state, &light_ray, &ao_shadow))
@@ -321,78 +324,8 @@ ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *
 	}
 }
 
-ccl_device void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, float3 throughput)
-{
-	int num_samples = kernel_data.integrator.ao_samples;
-	float num_samples_inv = 1.0f/num_samples;
-	float ao_factor = kernel_data.background.ao_factor;
-	float3 ao_N;
-	float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
-	float3 ao_alpha = shader_bsdf_alpha(kg, sd);
-
-	for(int j = 0; j < num_samples; j++) {
-		float bsdf_u, bsdf_v;
-		path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
-		float3 ao_D;
-		float ao_pdf;
-
-		sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
-		if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
-			Ray light_ray;
-			float3 ao_shadow;
-
-			light_ray.P = ray_offset(sd->P, sd->Ng);
-			light_ray.D = ao_D;
-			light_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
-			light_ray.time = sd->time;
-#endif
-			light_ray.dP = sd->dP;
-			light_ray.dD = differential3_zero();
-
-			if(!shadow_blocked(kg, state, &light_ray, &ao_shadow))
-				path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
-		}
-	}
-}
-
 #ifdef __SUBSURFACE__
 
-#ifdef __VOLUME__
-ccl_device void kernel_path_subsurface_update_volume_stack(KernelGlobals *kg,
-                                                           Ray *ray,
-                                                           VolumeStack *stack)
-{
-	kernel_assert(kernel_data.integrator.use_volumes);
-
-	Ray volume_ray = *ray;
-	Intersection isect;
-	const float3 Pend = volume_ray.P + volume_ray.D*volume_ray.t;
-
-	while(
-		scene_intersect(kg, &volume_ray, PATH_RAY_ALL_VISIBILITY,
-		                &isect, NULL, 0.0f, 0.0f)) {
-		ShaderData sd;
-		shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0);
-		kernel_volume_stack_enter_exit(kg, &sd, stack);
-
-		/* Move ray forward. */
-		volume_ray.P = ray_offset(sd.P, -sd.Ng);
-		volume_ray.D = normalize_len(Pend - volume_ray.P,
-		                             &volume_ray.t);
-
-		/* TODO(sergey): Find a faster way detecting that ray_offset moved
-		 * us pass through the end point.
-		 */
-		if(dot(ray->D, volume_ray.D) < 0.0f) {
-			break;
-		}
-	}
-}
-#endif
-
 ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, Ray *ray, float3 *throughput)
 {
 	float bssrdf_probability;
@@ -411,6 +344,8 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
 		int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
 #ifdef __VOLUME__
 		Ray volume_ray = *ray;
+		bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
+		                                ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
 #endif
 
 		/* compute lighting with the BSDF closure */
@@ -419,7 +354,6 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
 			PathState hit_state = *state;
 			Ray hit_ray = *ray;
 
-			hit_state.flag |= PATH_RAY_BSSRDF_ANCESTOR;
 			hit_state.rng_offset += PRNG_BOUNCE_NUM;
 			
 			kernel_path_surface_connect_light(kg, rng, &bssrdf_sd[hit], tp, state, L);
@@ -430,12 +364,12 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
 #endif
 
 #ifdef __VOLUME__
-				if(kernel_data.integrator.use_volumes) {
+				if(need_update_volume_stack) {
 					/* Setup ray from previous surface point to the new one. */
 					volume_ray.D = normalize_len(hit_ray.P - volume_ray.P,
 					                             &volume_ray.t);
 
-					kernel_path_subsurface_update_volume_stack(
+					kernel_volume_stack_update_for_subsurface(
 					    kg,
 					    &volume_ray,
 					    hit_state.volume_stack);
@@ -471,6 +405,11 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 	PathState state;
 	path_state_init(kg, &state, rng, sample, &ray);
 
+#ifdef __KERNEL_DEBUG__
+	DebugData debug_data;
+	debug_data_init(&debug_data);
+#endif
+
 	/* path iteration */
 	for(;;) {
 		/* intersect scene */
@@ -497,6 +436,14 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 		bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
 #endif
 
+#ifdef __KERNEL_DEBUG__
+		if(state.flag & PATH_RAY_CAMERA) {
+			debug_data.num_bvh_traversal_steps += isect.num_traversal_steps;
+			debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
+		}
+		debug_data.num_ray_bounces++;
+#endif
+
 #ifdef __LAMP_MIS__
 		if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
 			/* ray starting from previous non-transparent bounce */
@@ -553,7 +500,7 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 
 					/* direct light sampling */
 					kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
-						throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
+						throughput, &state, &L, all, &volume_ray, &volume_segment);
 
 					/* indirect sample. if we use distance sampling and take just
 					 * one sample for direct and indirect light, we could share
@@ -566,9 +513,6 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 						rphase, rscatter, &volume_segment, NULL, true);
 				}
 
-				if(result != VOLUME_PATH_SCATTERED)
-					throughput *= volume_segment.accum_transmittance;
-
 				/* free cached steps */
 				kernel_volume_decoupled_free(kg, &volume_segment);
 
@@ -578,6 +522,9 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 					else
 						break;
 				}
+				else {
+					throughput *= volume_segment.accum_transmittance;
+				}
 			}
 			else 
 #endif
@@ -717,460 +664,13 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 
 	kernel_write_light_passes(kg, buffer, &L, sample);
 
-	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
-}
-
-#ifdef __BRANCHED_PATH__
-
-/* branched path tracing: bounce off surface and integrate indirect light */
-ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
-	RNG *rng, ShaderData *sd, float3 throughput, float num_samples_adjust,
-	PathState *state, PathRadiance *L)
-{
-	for(int i = 0; i< sd->num_closure; i++) {
-		const ShaderClosure *sc = &sd->closure[i];
-
-		if(!CLOSURE_IS_BSDF(sc->type))
-			continue;
-		/* transparency is not handled here, but in outer loop */
-		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
-			continue;
-
-		int num_samples;
-
-		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
-			num_samples = kernel_data.integrator.diffuse_samples;
-		else if(CLOSURE_IS_BSDF_BSSRDF(sc->type))
-			num_samples = 1;
-		else if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
-			num_samples = kernel_data.integrator.glossy_samples;
-		else
-			num_samples = kernel_data.integrator.transmission_samples;
-
-		num_samples = ceil_to_int(num_samples_adjust*num_samples);
-
-		float num_samples_inv = num_samples_adjust/num_samples;
-		RNG bsdf_rng = cmj_hash(*rng, i);
-
-		for(int j = 0; j < num_samples; j++) {
-			PathState ps = *state;
-			float3 tp = throughput;
-			Ray bsdf_ray;
-
-			if(!kernel_branched_path_surface_bounce(kg, &bsdf_rng, sd, sc, j, num_samples, &tp, &ps, L, &bsdf_ray))
-				continue;
-
-			kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L);
-
-			/* for render passes, sum and reset indirect light pass variables
-			 * for the next samples */
-			path_radiance_sum_indirect(L);
-			path_radiance_reset_indirect(L);
-		}
-	}
-}
-
-#ifdef __SUBSURFACE__
-ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
-                                                        ShaderData *sd,
-                                                        PathRadiance *L,
-                                                        PathState *state,
-                                                        RNG *rng,
-                                                        Ray *ray,
-                                                        float3 throughput)
-{
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
-
-		if(!CLOSURE_IS_BSSRDF(sc->type))
-			continue;
-
-		/* set up random number generator */
-		uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
-		int num_samples = kernel_data.integrator.subsurface_samples;
-		float num_samples_inv = 1.0f/num_samples;
-		RNG bssrdf_rng = cmj_hash(*rng, i);
-
-		state->flag |= PATH_RAY_BSSRDF_ANCESTOR;
-
-		/* do subsurface scatter step with copy of shader data, this will
-		 * replace the BSSRDF with a diffuse BSDF closure */
-		for(int j = 0; j < num_samples; j++) {
-			ShaderData bssrdf_sd[BSSRDF_MAX_HITS];
-			float bssrdf_u, bssrdf_v;
-			path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-			int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
-#ifdef __VOLUME__
-			Ray volume_ray = *ray;
-#endif
-
-			/* compute lighting with the BSDF closure */
-			for(int hit = 0; hit < num_hits; hit++) {
-				PathState hit_state = *state;
-
-				path_state_branch(&hit_state, j, num_samples);
-
-#ifdef __VOLUME__
-				if(kernel_data.integrator.use_volumes) {
-					/* Setup ray from previous surface point to the new one. */
-					float3 P = ray_offset(bssrdf_sd[hit].P, -bssrdf_sd[hit].Ng);
-					volume_ray.D = normalize_len(P - volume_ray.P,
-					                             &volume_ray.t);
-
-					kernel_path_subsurface_update_volume_stack(
-					    kg,
-					    &volume_ray,
-					    hit_state.volume_stack);
-
-					/* Move volume ray forward. */
-					volume_ray.P = P;
-				}
-#endif
-
-#if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
-				/* direct light */
-				if(kernel_data.integrator.use_direct_light) {
-					bool all = kernel_data.integrator.sample_all_lights_direct;
-					kernel_branched_path_surface_connect_light(kg, rng,
-						&bssrdf_sd[hit], &hit_state, throughput, num_samples_inv, L, all);
-				}
-#endif
-
-				/* indirect light */
-				kernel_branched_path_surface_indirect_light(kg, rng,
-					&bssrdf_sd[hit], throughput, num_samples_inv,
-					&hit_state, L);
-			}
-		}
-
-		state->flag &= ~PATH_RAY_BSSRDF_ANCESTOR;
-	}
-}
-#endif
-
-ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
-{
-	/* initialize */
-	PathRadiance L;
-	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-	float L_transparent = 0.0f;
-
-	path_radiance_init(&L, kernel_data.film.use_light_pass);
-
-	PathState state;
-	path_state_init(kg, &state, rng, sample, &ray);
-
-	for(;;) {
-		/* intersect scene */
-		Intersection isect;
-		uint visibility = path_state_ray_visibility(kg, &state);
-
-#ifdef __HAIR__
-		float difl = 0.0f, extmax = 0.0f;
-		uint lcg_state = 0;
-
-		if(kernel_data.bvh.have_curves) {
-			if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {	
-				float3 pixdiff = ray.dD.dx + ray.dD.dy;
-				/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
-				difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
-			}
-
-			extmax = kernel_data.curve.maximum_width;
-			lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
-		}
-
-		bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax);
-#else
-		bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#endif
-
-#ifdef __VOLUME__
-		/* volume attenuation, emission, scatter */
-		if(state.volume_stack[0].shader != SHADER_NONE) {
-			Ray volume_ray = ray;
-			volume_ray.t = (hit)? isect.t: FLT_MAX;
-			
-			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
-
-#ifdef __VOLUME_DECOUPLED__
-			/* decoupled ray marching only supported on CPU */
-
-			/* cache steps along volume for repeated sampling */
-			VolumeSegment volume_segment;
-			ShaderData volume_sd;
-
-			shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
-			kernel_volume_decoupled_record(kg, &state,
-				&volume_ray, &volume_sd, &volume_segment, heterogeneous);
-
-			/* direct light sampling */
-			if(volume_segment.closure_flag & SD_SCATTER) {
-				volume_segment.sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
-
-				bool all = kernel_data.integrator.sample_all_lights_direct;
-
-				kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
-					throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
-
-				/* indirect light sampling */
-				int num_samples = kernel_data.integrator.volume_samples;
-				float num_samples_inv = 1.0f/num_samples;
-
-				for(int j = 0; j < num_samples; j++) {
-					/* workaround to fix correlation bug in T38710, can find better solution
-					 * in random number generator later, for now this is done here to not impact
-					 * performance of rendering without volumes */
-					RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
-
-					PathState ps = state;
-					Ray pray = ray;
-					float3 tp = throughput;
-
-					/* branch RNG state */
-					path_state_branch(&ps, j, num_samples);
-
-					/* scatter sample. if we use distance sampling and take just one
-					 * sample for direct and indirect light, we could share this
-					 * computation, but makes code a bit complex */
-					float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE);
-					float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE);
-
-					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
-						&ps, &pray, &volume_sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
-						
-					(void)result;
-					kernel_assert(result == VOLUME_PATH_SCATTERED);
-
-					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
-						kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
-
-						/* for render passes, sum and reset indirect light pass variables
-						 * for the next samples */
-						path_radiance_sum_indirect(&L);
-						path_radiance_reset_indirect(&L);
-					}
-				}
-			}
-
-			/* emission and transmittance */
-			if(volume_segment.closure_flag & SD_EMISSION)
-				path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
-			throughput *= volume_segment.accum_transmittance;
-
-			/* free cached steps */
-			kernel_volume_decoupled_free(kg, &volume_segment);
-#else
-			/* GPU: no decoupled ray marching, scatter probalistically */
-			int num_samples = kernel_data.integrator.volume_samples;
-			float num_samples_inv = 1.0f/num_samples;
-
-			/* todo: we should cache the shader evaluations from stepping
-			 * through the volume, for now we redo them multiple times */
-
-			for(int j = 0; j < num_samples; j++) {
-				PathState ps = state;
-				Ray pray = ray;
-				ShaderData volume_sd;
-				float3 tp = throughput * num_samples_inv;
-
-				/* branch RNG state */
-				path_state_branch(&ps, j, num_samples);
-
-				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &ps, &volume_sd, &volume_ray, &L, &tp, rng, heterogeneous);
-				
-#ifdef __VOLUME_SCATTER__
-				if(result == VOLUME_PATH_SCATTERED) {
-					/* todo: support equiangular, MIS and all light sampling.
-					 * alternatively get decoupled ray marching working on the GPU */
-					kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L);
-
-					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
-						kernel_path_indirect(kg, rng, pray, tp, num_samples, ps, &L);
-
-						/* for render passes, sum and reset indirect light pass variables
-						 * for the next samples */
-						path_radiance_sum_indirect(&L);
-						path_radiance_reset_indirect(&L);
-					}
-				}
-#endif
-			}
-
-			/* todo: avoid this calculation using decoupled ray marching */
-			kernel_volume_shadow(kg, &state, &volume_ray, &throughput);
-#endif
-		}
-#endif
-
-		if(!hit) {
-			/* eval background shader if nothing hit */
-			if(kernel_data.background.transparent) {
-				L_transparent += average(throughput);
-
-#ifdef __PASSES__
-				if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif
-					break;
-			}
-
-#ifdef __BACKGROUND__
-			/* sample background shader */
-			float3 L_background = indirect_background(kg, &state, &ray);
-			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
-#endif
-
-			break;
-		}
-
-		/* setup shading */
-		ShaderData sd;
-		shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce);
-		shader_eval_surface(kg, &sd, 0.0f, state.flag, SHADER_CONTEXT_MAIN);
-		shader_merge_closures(&sd);
-
-		/* holdout */
-#ifdef __HOLDOUT__
-		if(sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK)) {
-			if(kernel_data.background.transparent) {
-				float3 holdout_weight;
-				
-				if(sd.flag & SD_HOLDOUT_MASK)
-					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
-				else
-					holdout_weight = shader_holdout_eval(kg, &sd);
-
-				/* any throughput is ok, should all be identical here */
-				L_transparent += average(holdout_weight*throughput);
-			}
-
-			if(sd.flag & SD_HOLDOUT_MASK)
-				break;
-		}
-#endif
-
-		/* holdout mask objects do not write data passes */
-		kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
-
-#ifdef __EMISSION__
-		/* emission */
-		if(sd.flag & SD_EMISSION) {
-			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
-			path_radiance_accum_emission(&L, throughput, emission, state.bounce);
-		}
-#endif
-
-		/* transparency termination */
-		if(state.flag & PATH_RAY_TRANSPARENT) {
-			/* path termination. this is a strange place to put the termination, it's
-			 * mainly due to the mixed in MIS that we use. gives too many unneeded
-			 * shader evaluations, only need emission if we are going to terminate */
-			float probability = path_state_terminate_probability(kg, &state, throughput);
-
-			if(probability == 0.0f) {
-				break;
-			}
-			else if(probability != 1.0f) {
-				float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
-
-				if(terminate >= probability)
-					break;
-
-				throughput /= probability;
-			}
-		}
-
-#ifdef __AO__
-		/* ambient occlusion */
-		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
-			kernel_branched_path_ao(kg, &sd, &L, &state, rng, throughput);
-		}
-#endif
-
-#ifdef __SUBSURFACE__
-		/* bssrdf scatter to a different location on the same object */
-		if(sd.flag & SD_BSSRDF) {
-			kernel_branched_path_subsurface_scatter(kg, &sd, &L, &state,
-			                                        rng, &ray, throughput);
-		}
-#endif
-
-		if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
-			PathState hit_state = state;
-
-#ifdef __EMISSION__
-			/* direct light */
-			if(kernel_data.integrator.use_direct_light) {
-				bool all = kernel_data.integrator.sample_all_lights_direct;
-				kernel_branched_path_surface_connect_light(kg, rng,
-					&sd, &hit_state, throughput, 1.0f, &L, all);
-			}
-#endif
-
-			/* indirect light */
-			kernel_branched_path_surface_indirect_light(kg, rng,
-				&sd, throughput, 1.0f, &hit_state, &L);
-
-			/* continue in case of transparency */
-			throughput *= shader_bsdf_transparency(kg, &sd);
-
-			if(is_zero(throughput))
-				break;
-		}
-
-		path_state_next(kg, &state, LABEL_TRANSPARENT);
-		ray.P = ray_offset(sd.P, -sd.Ng);
-		ray.t -= sd.ray_length; /* clipping works through transparent */
-
-
-#ifdef __RAY_DIFFERENTIALS__
-		ray.dP = sd.dP;
-		ray.dD.dx = -sd.dI.dx;
-		ray.dD.dy = -sd.dI.dy;
-#endif
-
-#ifdef __VOLUME__
-		/* enter/exit volume */
-		kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
+#ifdef __KERNEL_DEBUG__
+	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
 #endif
-	}
-
-	float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
-
-	kernel_write_light_passes(kg, buffer, &L, sample);
 
 	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
 }
 
-#endif
-
-ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int x, int y, RNG *rng, Ray *ray)
-{
-	float filter_u;
-	float filter_v;
-
-	int num_samples = kernel_data.integrator.aa_samples;
-
-	path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v);
-
-	/* sample camera ray */
-
-	float lens_u = 0.0f, lens_v = 0.0f;
-
-	if(kernel_data.cam.aperturesize > 0.0f)
-		path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
-
-	float time = 0.0f;
-
-#ifdef __CAMERA_MOTION__
-	if(kernel_data.cam.shuttertime != -1.0f)
-		time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME);
-#endif
-
-	camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
-}
-
 ccl_device void kernel_path_trace(KernelGlobals *kg,
 	ccl_global float *buffer, ccl_global uint *rng_state,
 	int sample, int x, int y, int offset, int stride)
@@ -1202,38 +702,5 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
 	path_rng_end(kg, rng_state, rng);
 }
 
-#ifdef __BRANCHED_PATH__
-ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
-	ccl_global float *buffer, ccl_global uint *rng_state,
-	int sample, int x, int y, int offset, int stride)
-{
-	/* buffer offset */
-	int index = offset + x + y*stride;
-	int pass_stride = kernel_data.film.pass_stride;
-
-	rng_state += index;
-	buffer += index*pass_stride;
-
-	/* initialize random numbers and ray */
-	RNG rng;
-	Ray ray;
-
-	kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
-
-	/* integrate */
-	float4 L;
-
-	if(ray.t != 0.0f)
-		L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer);
-	else
-		L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
-	/* accumulate result in output buffer */
-	kernel_write_pass_float4(buffer, sample, L);
-
-	path_rng_end(kg, rng_state, rng);
-}
-#endif
-
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
new file mode 100644
index 00000000000..b6d64985f6a
--- /dev/null
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -0,0 +1,534 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __BRANCHED_PATH__
+/* AO for the branched integrator: trace integrator.ao_samples occlusion rays from sd and accumulate the unblocked ones into L. */
+ccl_device void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, float3 throughput)
+{
+	int num_samples = kernel_data.integrator.ao_samples;
+	float num_samples_inv = 1.0f/num_samples;
+	float ao_factor = kernel_data.background.ao_factor;
+	float3 ao_N;
+	float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
+	float3 ao_alpha = shader_bsdf_alpha(kg, sd);
+
+	for(int j = 0; j < num_samples; j++) {
+		float bsdf_u, bsdf_v;
+		path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+
+		float3 ao_D;
+		float ao_pdf;
+		/* sample a direction ao_D (and its pdf) in the hemisphere around ao_N */
+		sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
+		/* skip directions below the geometric normal Ng or with zero pdf */
+		if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+			Ray light_ray;
+			float3 ao_shadow;
+			/* occlusion ray: starts at the offset surface point, length limited to ao_distance */
+			light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+			light_ray.D = ao_D;
+			light_ray.t = kernel_data.background.ao_distance;
+#ifdef __OBJECT_MOTION__
+			light_ray.time = ccl_fetch(sd, time);
+#endif
+			light_ray.dP = ccl_fetch(sd, dP);
+			light_ray.dD = differential3_zero();
+			/* accumulate AO, weighted by 1/num_samples, only when shadow_blocked() returns false */
+			if(!shadow_blocked(kg, state, &light_ray, &ao_shadow))
+				path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+		}
+	}
+}
+
+/* Bounce off the surface and integrate indirect light: every BSDF closure of sd
+ * (transparent ones excepted) is sampled several times via kernel_path_indirect(). */
+ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
+	RNG *rng, ShaderData *sd, float3 throughput, float num_samples_adjust,
+	PathState *state, PathRadiance *L)
+{
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		const ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
+
+		if(!CLOSURE_IS_BSDF(sc->type))
+			continue;
+		/* transparency is not handled here, but in outer loop */
+		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
+			continue;
+		/* pick the sample count configured for this closure's category */
+		int num_samples;
+
+		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
+			num_samples = kernel_data.integrator.diffuse_samples;
+		else if(CLOSURE_IS_BSDF_BSSRDF(sc->type))
+			num_samples = 1;
+		else if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
+			num_samples = kernel_data.integrator.glossy_samples;
+		else
+			num_samples = kernel_data.integrator.transmission_samples;
+		/* scale the count by num_samples_adjust (ceil_to_int) and derive the per-sample weight */
+		num_samples = ceil_to_int(num_samples_adjust*num_samples);
+
+		float num_samples_inv = num_samples_adjust/num_samples;
+		RNG bsdf_rng = cmj_hash(*rng, i);
+
+		for(int j = 0; j < num_samples; j++) {
+			PathState ps = *state;
+			float3 tp = throughput;
+			Ray bsdf_ray;
+			/* the bounce works on the per-sample copies ps/tp; on failure this sample is skipped */
+			if(!kernel_branched_path_surface_bounce(kg, &bsdf_rng, sd, sc, j, num_samples, &tp, &ps, L, &bsdf_ray))
+				continue;
+
+			kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L);
+
+			/* for render passes, sum and reset indirect light pass variables
+			 * for the next samples */
+			path_radiance_sum_indirect(L);
+			path_radiance_reset_indirect(L);
+		}
+	}
+}
+/* Subsurface scattering for the branched integrator: sample every BSSRDF closure of sd and light the scattered hit points. */
+#ifdef __SUBSURFACE__
+ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
+                                                        ShaderData *sd,
+                                                        PathRadiance *L,
+                                                        PathState *state,
+                                                        RNG *rng,
+                                                        Ray *ray,
+                                                        float3 throughput)
+{
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
+
+		if(!CLOSURE_IS_BSSRDF(sc->type))
+			continue;
+
+		/* set up random number generator */
+		uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
+		int num_samples = kernel_data.integrator.subsurface_samples;
+		float num_samples_inv = 1.0f/num_samples;
+		RNG bssrdf_rng = cmj_hash(*rng, i);
+
+		/* do subsurface scatter step with copy of shader data, this will
+		 * replace the BSSRDF with a diffuse BSDF closure */
+		for(int j = 0; j < num_samples; j++) {
+			ShaderData bssrdf_sd[BSSRDF_MAX_HITS];
+			float bssrdf_u, bssrdf_v;
+			path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+			int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
+#ifdef __VOLUME__
+			Ray volume_ray = *ray;
+			bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
+			                                ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
+#endif
+
+			/* compute lighting with the BSDF closure at each of the num_hits scattered points */
+			for(int hit = 0; hit < num_hits; hit++) {
+				PathState hit_state = *state;
+				/* each hit works on its own copy of the path state, branched for sample j */
+				path_state_branch(&hit_state, j, num_samples);
+
+#ifdef __VOLUME__
+				if(need_update_volume_stack) {
+					/* Setup ray from previous surface point to the new one. */
+					float3 P = ray_offset(bssrdf_sd[hit].P, -bssrdf_sd[hit].Ng);
+					volume_ray.D = normalize_len(P - volume_ray.P,
+					                             &volume_ray.t);
+
+					kernel_volume_stack_update_for_subsurface(
+					    kg,
+					    &volume_ray,
+					    hit_state.volume_stack);
+
+					/* Move volume ray forward. */
+					volume_ray.P = P;
+				}
+#endif
+
+#ifdef __EMISSION__
+				/* direct light */
+				if(kernel_data.integrator.use_direct_light) {
+					bool all = kernel_data.integrator.sample_all_lights_direct;
+					kernel_branched_path_surface_connect_light(kg, rng,
+						&bssrdf_sd[hit], &hit_state, throughput, num_samples_inv, L, all);
+				}
+#endif
+
+				/* indirect light */
+				kernel_branched_path_surface_indirect_light(kg, rng,
+					&bssrdf_sd[hit], throughput, num_samples_inv,
+					&hit_state, L);
+			}
+		}
+	}
+}
+#endif
+
+ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
+{
+	/* initialize */
+	PathRadiance L;
+	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+	float L_transparent = 0.0f;
+
+	path_radiance_init(&L, kernel_data.film.use_light_pass);
+
+	PathState state;
+	path_state_init(kg, &state, rng, sample, &ray);
+
+#ifdef __KERNEL_DEBUG__
+	DebugData debug_data;
+	debug_data_init(&debug_data);
+#endif
+
+	/* Main Loop
+	 * Here we only handle transparency intersections from the camera ray.
+	 * Indirect bounces are handled in kernel_branched_path_surface_indirect_light().
+	 */
+	for(;;) {
+		/* intersect scene */
+		Intersection isect;
+		uint visibility = path_state_ray_visibility(kg, &state);
+
+#ifdef __HAIR__
+		float difl = 0.0f, extmax = 0.0f;
+		uint lcg_state = 0;
+
+		if(kernel_data.bvh.have_curves) {
+			if(kernel_data.cam.resolution == 1) {
+				float3 pixdiff = ray.dD.dx + ray.dD.dy;
+				/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
+				difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
+			}
+
+			extmax = kernel_data.curve.maximum_width;
+			lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
+		}
+
+		bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax);
+#else
+		bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
+#endif
+
+#ifdef __KERNEL_DEBUG__
+		debug_data.num_bvh_traversal_steps += isect.num_traversal_steps;
+		debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
+		debug_data.num_ray_bounces++;
+#endif
+
+#ifdef __VOLUME__
+		/* volume attenuation, emission, scatter */
+		if(state.volume_stack[0].shader != SHADER_NONE) {
+			Ray volume_ray = ray;
+			volume_ray.t = (hit)? isect.t: FLT_MAX;
+			
+			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
+
+#ifdef __VOLUME_DECOUPLED__
+			/* decoupled ray marching only supported on CPU */
+
+			/* cache steps along volume for repeated sampling */
+			VolumeSegment volume_segment;
+			ShaderData volume_sd;
+
+			shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
+			kernel_volume_decoupled_record(kg, &state,
+				&volume_ray, &volume_sd, &volume_segment, heterogeneous);
+
+			/* direct light sampling */
+			if(volume_segment.closure_flag & SD_SCATTER) {
+				volume_segment.sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
+
+				bool all = kernel_data.integrator.sample_all_lights_direct;
+
+				kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
+					throughput, &state, &L, all, &volume_ray, &volume_segment);
+
+				/* indirect light sampling */
+				int num_samples = kernel_data.integrator.volume_samples;
+				float num_samples_inv = 1.0f/num_samples;
+
+				for(int j = 0; j < num_samples; j++) {
+					/* workaround to fix correlation bug in T38710, can find better solution
+					 * in random number generator later, for now this is done here to not impact
+					 * performance of rendering without volumes */
+					RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
+
+					PathState ps = state;
+					Ray pray = ray;
+					float3 tp = throughput;
+
+					/* branch RNG state */
+					path_state_branch(&ps, j, num_samples);
+
+					/* scatter sample. if we use distance sampling and take just one
+					 * sample for direct and indirect light, we could share this
+					 * computation, but makes code a bit complex */
+					float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE);
+					float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE);
+
+					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+						&ps, &pray, &volume_sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
+						
+					(void)result;
+					kernel_assert(result == VOLUME_PATH_SCATTERED);
+
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
+						kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
+
+						/* for render passes, sum and reset indirect light pass variables
+						 * for the next samples */
+						path_radiance_sum_indirect(&L);
+						path_radiance_reset_indirect(&L);
+					}
+				}
+			}
+
+			/* emission and transmittance */
+			if(volume_segment.closure_flag & SD_EMISSION)
+				path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+			throughput *= volume_segment.accum_transmittance;
+
+			/* free cached steps */
+			kernel_volume_decoupled_free(kg, &volume_segment);
+#else
+			/* GPU: no decoupled ray marching, scatter probabilistically */
+			int num_samples = kernel_data.integrator.volume_samples;
+			float num_samples_inv = 1.0f/num_samples;
+
+			/* todo: we should cache the shader evaluations from stepping
+			 * through the volume, for now we redo them multiple times */
+
+			for(int j = 0; j < num_samples; j++) {
+				PathState ps = state;
+				Ray pray = ray;
+				ShaderData volume_sd;
+				float3 tp = throughput * num_samples_inv;
+
+				/* branch RNG state */
+				path_state_branch(&ps, j, num_samples);
+
+				VolumeIntegrateResult result = kernel_volume_integrate(
+					kg, &ps, &volume_sd, &volume_ray, &L, &tp, rng, heterogeneous);
+				
+#ifdef __VOLUME_SCATTER__
+				if(result == VOLUME_PATH_SCATTERED) {
+					/* todo: support equiangular, MIS and all light sampling.
+					 * alternatively get decoupled ray marching working on the GPU */
+					kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L);
+
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
+						kernel_path_indirect(kg, rng, pray, tp, num_samples, ps, &L);
+
+						/* for render passes, sum and reset indirect light pass variables
+						 * for the next samples */
+						path_radiance_sum_indirect(&L);
+						path_radiance_reset_indirect(&L);
+					}
+				}
+#endif
+			}
+
+			/* todo: avoid this calculation using decoupled ray marching */
+			kernel_volume_shadow(kg, &state, &volume_ray, &throughput);
+#endif
+		}
+#endif
+
+		if(!hit) {
+			/* eval background shader if nothing hit */
+			if(kernel_data.background.transparent) {
+				L_transparent += average(throughput);
+
+#ifdef __PASSES__
+				if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
+#endif
+					break;
+			}
+
+#ifdef __BACKGROUND__
+			/* sample background shader */
+			float3 L_background = indirect_background(kg, &state, &ray);
+			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
+#endif
+
+			break;
+		}
+
+		/* setup shading */
+		ShaderData sd;
+		shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce);
+		shader_eval_surface(kg, &sd, 0.0f, state.flag, SHADER_CONTEXT_MAIN);
+		shader_merge_closures(&sd);
+
+		/* holdout */
+#ifdef __HOLDOUT__
+		if(sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK)) {
+			if(kernel_data.background.transparent) {
+				float3 holdout_weight;
+				
+				if(sd.flag & SD_HOLDOUT_MASK)
+					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
+				else
+					holdout_weight = shader_holdout_eval(kg, &sd);
+
+				/* any throughput is ok, should all be identical here */
+				L_transparent += average(holdout_weight*throughput);
+			}
+
+			if(sd.flag & SD_HOLDOUT_MASK)
+				break;
+		}
+#endif
+
+		/* holdout mask objects do not write data passes */
+		kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
+
+#ifdef __EMISSION__
+		/* emission */
+		if(sd.flag & SD_EMISSION) {
+			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
+			path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+		}
+#endif
+
+		/* transparency termination */
+		if(state.flag & PATH_RAY_TRANSPARENT) {
+			/* path termination. this is a strange place to put the termination, it's
+			 * mainly due to the mixed in MIS that we use. gives too many unneeded
+			 * shader evaluations, only need emission if we are going to terminate */
+			float probability = path_state_terminate_probability(kg, &state, throughput);
+
+			if(probability == 0.0f) {
+				break;
+			}
+			else if(probability != 1.0f) {
+				float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
+
+				if(terminate >= probability)
+					break;
+
+				throughput /= probability;
+			}
+		}
+
+#ifdef __AO__
+		/* ambient occlusion */
+		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
+			kernel_branched_path_ao(kg, &sd, &L, &state, rng, throughput);
+		}
+#endif
+
+#ifdef __SUBSURFACE__
+		/* bssrdf scatter to a different location on the same object */
+		if(sd.flag & SD_BSSRDF) {
+			kernel_branched_path_subsurface_scatter(kg, &sd, &L, &state,
+			                                        rng, &ray, throughput);
+		}
+#endif
+
+		if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
+			PathState hit_state = state;
+
+#ifdef __EMISSION__
+			/* direct light */
+			if(kernel_data.integrator.use_direct_light) {
+				bool all = kernel_data.integrator.sample_all_lights_direct;
+				kernel_branched_path_surface_connect_light(kg, rng,
+					&sd, &hit_state, throughput, 1.0f, &L, all);
+			}
+#endif
+
+			/* indirect light */
+			kernel_branched_path_surface_indirect_light(kg, rng,
+				&sd, throughput, 1.0f, &hit_state, &L);
+
+			/* continue in case of transparency */
+			throughput *= shader_bsdf_transparency(kg, &sd);
+
+			if(is_zero(throughput))
+				break;
+		}
+
+		/* Update Path State */
+		state.flag |= PATH_RAY_TRANSPARENT;
+		state.transparent_bounce++;
+
+		ray.P = ray_offset(sd.P, -sd.Ng);
+		ray.t -= sd.ray_length; /* clipping works through transparent */
+
+
+#ifdef __RAY_DIFFERENTIALS__
+		ray.dP = sd.dP;
+		ray.dD.dx = -sd.dI.dx;
+		ray.dD.dy = -sd.dI.dy;
+#endif
+
+#ifdef __VOLUME__
+		/* enter/exit volume */
+		kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
+#endif
+	}
+
+	float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
+
+	kernel_write_light_passes(kg, buffer, &L, sample);
+
+#ifdef __KERNEL_DEBUG__
+	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
+#endif
+
+	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
+}
+/* Render one branched-path sample for pixel (x, y) and write the result into that pixel's slot of buffer. */
+ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
+	ccl_global float *buffer, ccl_global uint *rng_state,
+	int sample, int x, int y, int offset, int stride)
+{
+	/* buffer offset: rng_state advances by index, buffer by index*pass_stride */
+	int index = offset + x + y*stride;
+	int pass_stride = kernel_data.film.pass_stride;
+
+	rng_state += index;
+	buffer += index*pass_stride;
+
+	/* initialize random numbers and ray */
+	RNG rng;
+	Ray ray;
+
+	kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
+
+	/* integrate */
+	float4 L;
+	/* a camera ray with t == 0 is not traced and contributes all zeros */
+	if(ray.t != 0.0f)
+		L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer);
+	else
+		L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+	/* accumulate result in output buffer */
+	kernel_write_pass_float4(buffer, sample, L);
+
+	path_rng_end(kg, rng_state, rng);
+}
+
+#endif  /* __BRANCHED_PATH__ */
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h
new file mode 100644
index 00000000000..1912dfa16ed
--- /dev/null
+++ b/intern/cycles/kernel/kernel_path_common.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+/* Initialize the RNG for pixel (x, y) at the given sample and generate the camera ray into *ray. */
+ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg,
+                                               ccl_global uint *rng_state,
+                                               int sample,
+                                               int x, int y,
+                                               ccl_addr_space RNG *rng,
+                                               ccl_addr_space Ray *ray)
+{
+	float filter_u;
+	float filter_v;
+
+	int num_samples = kernel_data.integrator.aa_samples;
+	/* path_rng_init() also outputs the pixel filter samples filter_u/filter_v */
+	path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v);
+
+	/* sample camera ray */
+
+	float lens_u = 0.0f, lens_v = 0.0f;
+	/* lens samples are only drawn when the aperture size is positive, otherwise they stay (0, 0) */
+	if(kernel_data.cam.aperturesize > 0.0f)
+		path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
+
+	float time = 0.0f;
+	/* time stays 0 unless __CAMERA_MOTION__ is defined and shuttertime != -1 */
+#ifdef __CAMERA_MOTION__
+	if(kernel_data.cam.shuttertime != -1.0f)
+		time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME);
+#endif
+
+	camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index f29168642a4..15efb2371de 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -11,12 +11,12 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG *rng, int sample, Ray *ray)
+ccl_device_inline void path_state_init(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space RNG *rng, int sample, ccl_addr_space Ray *ray)
 {
 	state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP;
 
@@ -51,7 +51,7 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG
 #endif
 }
 
-ccl_device_inline void path_state_next(KernelGlobals *kg, PathState *state, int label)
+ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathState *state, int label)
 {
 	/* ray through transparent keeps same flags from previous ray and is
 	 * not counted as a regular bounce, transparent has separate max */
@@ -106,7 +106,7 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, PathState *state, int
 			state->flag &= ~(PATH_RAY_GLOSSY|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP);
 		}
 		else if(label & LABEL_GLOSSY) {
-			state->flag |= PATH_RAY_GLOSSY|PATH_RAY_GLOSSY_ANCESTOR;
+			state->flag |= PATH_RAY_GLOSSY;
 			state->flag &= ~(PATH_RAY_DIFFUSE|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP);
 		}
 		else {
@@ -138,7 +138,7 @@ ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *s
 	return flag;
 }
 
-ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, PathState *state, const float3 throughput)
+ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_addr_space PathState *state, const float3 throughput)
 {
 	if(state->flag & PATH_RAY_TRANSPARENT) {
 		/* transparent rays treated separately */
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index 9553c2da0df..fe85a6b6e4b 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -24,7 +24,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 {
 #ifdef __EMISSION__
 	/* sample illumination from lights to find path contribution */
-	if(!(sd->flag & SD_BSDF_HAS_EVAL))
+	if(!(ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))
 		return;
 
 	Ray light_ray;
@@ -32,12 +32,15 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 	bool is_lamp;
 
 #ifdef __OBJECT_MOTION__
-	light_ray.time = sd->time;
+	light_ray.time = ccl_fetch(sd, time);
 #endif
 
 	if(sample_all_lights) {
 		/* lamp sampling */
 		for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
+			if(UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce)))
+			   continue;
+
 			int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
 			float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
 			RNG lamp_rng = cmj_hash(*rng, i);
@@ -50,7 +53,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
 
 				LightSample ls;
-				lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls);
+				lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls);
 
 				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 					/* trace shadow ray */
@@ -82,7 +85,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 					light_t = 0.5f*light_t;
 
 				LightSample ls;
-				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+				light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
 
 				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 					/* trace shadow ray */
@@ -103,7 +106,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
 
 		LightSample ls;
-		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+		light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
 
 		/* sample random light */
 		if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
@@ -146,15 +149,15 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 	path_state_next(kg, state, label);
 
 	/* setup ray */
-	ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
+	ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
 	ray->D = bsdf_omega_in;
 	ray->t = FLT_MAX;
 #ifdef __RAY_DIFFERENTIALS__
-	ray->dP = sd->dP;
+	ray->dP = ccl_fetch(sd, dP);
 	ray->dD = bsdf_domega_in;
 #endif
 #ifdef __OBJECT_MOTION__
-	ray->time = sd->time;
+	ray->time = ccl_fetch(sd, time);
 #endif
 
 #ifdef __VOLUME__
@@ -178,12 +181,13 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 
 #endif
 
+#ifndef __SPLIT_KERNEL__
 /* path tracing: connect path directly to position on a light and add it to L */
-ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L)
+ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_addr_space RNG *rng,
+	ShaderData *sd, float3 throughput, ccl_addr_space PathState *state, PathRadiance *L)
 {
 #ifdef __EMISSION__
-	if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)))
+	if(!(kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)))
 		return;
 
 	/* sample illumination from lights to find path contribution */
@@ -196,11 +200,11 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
 	bool is_lamp;
 
 #ifdef __OBJECT_MOTION__
-	light_ray.time = sd->time;
+	light_ray.time = ccl_fetch(sd, time);
 #endif
 
 	LightSample ls;
-	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+	light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
 
 	if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 		/* trace shadow ray */
@@ -213,13 +217,14 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
 	}
 #endif
 }
+#endif
 
 /* path tracing: bounce off or through surface to with new direction stored in ray */
-ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
+ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, ccl_addr_space RNG *rng,
+	ShaderData *sd, ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, PathRadiance *L, ccl_addr_space Ray *ray)
 {
 	/* no BSDF? we can stop here */
-	if(sd->flag & SD_BSDF) {
+	if(ccl_fetch(sd, flag) & SD_BSDF) {
 		/* sample BSDF */
 		float bsdf_pdf;
 		BsdfEval bsdf_eval;
@@ -251,16 +256,16 @@ ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 		path_state_next(kg, state, label);
 
 		/* setup ray */
-		ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
+		ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
 		ray->D = bsdf_omega_in;
 
 		if(state->bounce == 0)
-			ray->t -= sd->ray_length; /* clipping works through transparent */
+			ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
 		else
 			ray->t = FLT_MAX;
 
 #ifdef __RAY_DIFFERENTIALS__
-		ray->dP = sd->dP;
+		ray->dP = ccl_fetch(sd, dP);
 		ray->dD = bsdf_domega_in;
 #endif
 
@@ -272,16 +277,21 @@ ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 		return true;
 	}
 #ifdef __VOLUME__
-	else if(sd->flag & SD_HAS_ONLY_VOLUME) {
+	else if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) {
 		/* no surface shader but have a volume shader? act transparent */
 
 		/* update path state, count as transparent */
 		path_state_next(kg, state, LABEL_TRANSPARENT);
 
+		if(state->bounce == 0)
+			ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
+		else
+			ray->t = FLT_MAX;
+
 		/* setup ray position, direction stays unchanged */
-		ray->P = ray_offset(sd->P, -sd->Ng);
+		ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng));
 #ifdef __RAY_DIFFERENTIALS__
-		ray->dP = sd->dP;
+		ray->dP = ccl_fetch(sd, dP);
 #endif
 
 		/* enter/exit volume */
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index da2d5e6eca8..82dc0f97622 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -40,7 +40,7 @@ ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
 	light_ray.time = sd->time;
 #endif
 
-	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
 	if(ls.pdf == 0.0f)
 		return;
 	
@@ -56,7 +56,12 @@ ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
 #endif
 }
 
-ccl_device bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
+#ifdef __KERNEL_GPU__
+ccl_device_noinline
+#else
+ccl_device
+#endif
+bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
 	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
 {
 	/* sample phase function */
@@ -102,7 +107,7 @@ ccl_device bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
 
 ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
 	ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L,
-	float num_samples_adjust, bool sample_all_lights, Ray *ray, const VolumeSegment *segment)
+	bool sample_all_lights, Ray *ray, const VolumeSegment *segment)
 {
 #ifdef __EMISSION__
 	if(!kernel_data.integrator.use_direct_light)
@@ -119,8 +124,11 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 	if(sample_all_lights) {
 		/* lamp sampling */
 		for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
-			int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
-			float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
+			if(UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce)))
+				continue;
+
+			int num_samples = light_select_num_samples(kg, i);
+			float num_samples_inv = 1.0f/(num_samples*kernel_data.integrator.num_all_lights);
 			RNG lamp_rng = cmj_hash(*rng, i);
 
 			if(kernel_data.integrator.pdf_triangles != 0.0f)
@@ -166,8 +174,8 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 
 		/* mesh light sampling */
 		if(kernel_data.integrator.pdf_triangles != 0.0f) {
-			int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples);
-			float num_samples_inv = num_samples_adjust/num_samples;
+			int num_samples = kernel_data.integrator.mesh_light_samples;
+			float num_samples_inv = 1.0f/num_samples;
 
 			if(kernel_data.integrator.num_all_lights)
 				num_samples_inv *= 0.5f;
@@ -183,7 +191,7 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 					light_t = 0.5f*light_t;
 
 				LightSample ls;
-				light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls);
+				light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
 
 				float3 tp = throughput;
 
@@ -198,7 +206,7 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 				kernel_assert(result == VOLUME_PATH_SCATTERED);
 
 				/* todo: split up light_sample so we don't have to call it again with new position */
-				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
 
 				if(ls.pdf == 0.0f)
 					continue;
@@ -222,7 +230,7 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
 
 		LightSample ls;
-		light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls);
+		light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
 
 		float3 tp = throughput;
 
@@ -237,7 +245,7 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 		kernel_assert(result == VOLUME_PATH_SCATTERED);
 
 		/* todo: split up light_sample so we don't have to call it again with new position */
-		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
 
 		if(ls.pdf == 0.0f)
 			return;
diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h
index 6744471d659..62922df3286 100644
--- a/intern/cycles/kernel/kernel_projection.h
+++ b/intern/cycles/kernel/kernel_projection.h
@@ -55,18 +55,18 @@ ccl_device float3 spherical_to_direction(float theta, float phi)
 
 /* Equirectangular coordinates <-> Cartesian direction */
 
-ccl_device float2 direction_to_equirectangular(float3 dir)
+ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range)
 {
-	float u = -atan2f(dir.y, dir.x)/(M_2PI_F) + 0.5f;
-	float v = atan2f(dir.z, hypotf(dir.x, dir.y))/M_PI_F + 0.5f;
+	float u = (atan2f(dir.y, dir.x) - range.y) / range.x; /* azimuth in the XY plane, then the inverse of phi = range.x*u + range.y */
+	float v = (acosf(dir.z / len(dir)) - range.w) / range.z; /* angle from +Z, then the inverse of theta = range.z*v + range.w; assumes dir is non-zero */
 
 	return make_float2(u, v);
 }
 
-ccl_device float3 equirectangular_to_direction(float u, float v)
+ccl_device float3 equirectangular_range_to_direction(float u, float v, float4 range)
 {
-	float phi = M_PI_F*(1.0f - 2.0f*u);
-	float theta = M_PI_F*(1.0f - v);
+	float phi = range.x*u + range.y;
+	float theta = range.z*v + range.w;
 
 	return make_float3(
 		sinf(theta)*cosf(phi),
@@ -74,6 +74,16 @@ ccl_device float3 equirectangular_to_direction(float u, float v)
 		cosf(theta));
 }
 
+ccl_device float2 direction_to_equirectangular(float3 dir)
+{
+	return direction_to_equirectangular_range(dir, make_float4(-M_2PI_F, M_PI_F, -M_PI_F, M_PI_F)); /* fixed range: phi = pi - 2*pi*u, theta = pi - pi*v */
+}
+
+ccl_device float3 equirectangular_to_direction(float u, float v)
+{
+	return equirectangular_range_to_direction(u, v, make_float4(-M_2PI_F, M_PI_F, -M_PI_F, M_PI_F)); /* fixed range: phi = pi - 2*pi*u, theta = pi - pi*v */
+}
+
 /* Fisheye <-> Cartesian direction */
 
 ccl_device float2 direction_to_fisheye(float3 dir, float fov)
@@ -153,6 +163,10 @@ ccl_device float3 mirrorball_to_direction(float u, float v)
 
 	dir.x = 2.0f*u - 1.0f;
 	dir.z = 2.0f*v - 1.0f;
+
+	if(dir.x*dir.x + dir.z*dir.z > 1.0f)
+		return make_float3(0.0f, 0.0f, 0.0f);
+
 	dir.y = -sqrtf(max(1.0f - dir.x*dir.x - dir.z*dir.z, 0.0f));
 
 	/* reflection */
@@ -180,7 +194,9 @@ ccl_device float3 panorama_to_direction(KernelGlobals *kg, float u, float v)
 {
 	switch(kernel_data.cam.panorama_type) {
 		case PANORAMA_EQUIRECTANGULAR:
-			return equirectangular_to_direction(u, v);
+			return equirectangular_range_to_direction(u, v, kernel_data.cam.equirectangular_range);
+		case PANORAMA_MIRRORBALL:
+			return mirrorball_to_direction(u, v);
 		case PANORAMA_FISHEYE_EQUIDISTANT:
 			return fisheye_to_direction(u, v, kernel_data.cam.fisheye_fov);
 		case PANORAMA_FISHEYE_EQUISOLID:
@@ -194,7 +210,9 @@ ccl_device float2 direction_to_panorama(KernelGlobals *kg, float3 dir)
 {
 	switch(kernel_data.cam.panorama_type) {
 		case PANORAMA_EQUIRECTANGULAR:
-			return direction_to_equirectangular(dir);
+			return direction_to_equirectangular_range(dir, kernel_data.cam.equirectangular_range);
+		case PANORAMA_MIRRORBALL:
+			return direction_to_mirrorball(dir);
 		case PANORAMA_FISHEYE_EQUIDISTANT:
 			return direction_to_fisheye(dir, kernel_data.cam.fisheye_fov);
 		case PANORAMA_FISHEYE_EQUISOLID:
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
new file mode 100644
index 00000000000..cf5614b8a86
--- /dev/null
+++ b/intern/cycles/kernel/kernel_queues.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_QUEUE_H__
+#define __KERNEL_QUEUE_H__
+
+/*
+ * Queue utility functions for split kernel
+ */
+
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+
+/*
+ * Append a ray index to one of the queues stored back to back in 'queues'.
+ */
+ccl_device void enqueue_ray_index(
+        int ray_index,                /* Ray index to be enqueued. */
+        int queue_number,             /* Queue in which the ray index should be enqueued. */
+        ccl_global int *queues,       /* Buffer of all queues. */
+        int queue_size,               /* Size of each queue. */
+        ccl_global int *queue_index) /* Array of size num_queues; Used for atomic increment. */
+{
+	/* Reserve the next free slot of this queue; atomic_inc yields the counter value before the increment. */
+	int slot = atomic_inc(&queue_index[queue_number]);
+	queues[queue_number * queue_size + slot] = ray_index;
+}
+
+/*
+ * Fetch the ray index stored in this thread's slot of the given queue.
+ * Returns the stored ray index for threads that have work to do;
+ * returns 'QUEUE_EMPTY_SLOT' for threads whose slot holds no ray,
+ * i.e. every ray in the queue has already been handed out and there
+ * is no ray left for this thread.
+ */
+ccl_device int get_ray_index(
+        int thread_index,       /* Global thread index. */
+        int queue_number,       /* Queue to operate on. */
+        ccl_global int *queues, /* Buffer of all queues. */
+        int queuesize,          /* Size of a queue. */
+        int empty_queue)        /* Empty the queue slot as soon as we fetch the ray index. */
+{
+	int slot = queue_number * queuesize + thread_index;
+	int ray_index = queues[slot];
+	if(empty_queue && ray_index != QUEUE_EMPTY_SLOT)
+		queues[slot] = QUEUE_EMPTY_SLOT;
+	return ray_index;
+}
+
+/* The following functions implement the local-memory variant of the enqueue ray index function. */
+
+/* Work-group variant of enqueue_ray_index. All threads should call this function (it contains barriers), even those with enqueue_flag == 0. */
+ccl_device void enqueue_ray_index_local(
+        int ray_index,                               /* Ray index to enqueue. */
+        int queue_number,                            /* Queue in which to enqueue ray index. */
+        char enqueue_flag,                           /* True for threads whose ray index has to be enqueued. */
+        int queuesize,                               /* Size of each queue. */
+        ccl_local unsigned int *local_queue_atomics,   /* To do local queue atomics; NOTE(review): presumably zeroed by the caller before this call - confirm. */
+        ccl_global int *Queue_data,                  /* Queues. */
+        ccl_global int *Queue_index)                 /* To do global queue atomics. */
+{
+	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+
+	/* Get local queue id: each enqueuing thread takes the next value of the shared local counter. */
+	unsigned int lqidx;
+	if(enqueue_flag) {
+		lqidx = atomic_inc(local_queue_atomics);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* Get global queue offset: thread 0 adds the local count to the global counter and stores the value atomic_add returns back into the local counter. */
+	if(lidx == 0) {
+		*local_queue_atomics = atomic_add(&Queue_index[queue_number], *local_queue_atomics);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* Get global queue index and enqueue ray; lqidx is only read by threads that set it above. */
+	if(enqueue_flag) {
+		unsigned int my_gqidx = queue_number * queuesize + (*local_queue_atomics) + lqidx;
+		Queue_data[my_gqidx] = ray_index;
+	}
+}
+
+/* Atomically bump the local counter of the given queue; returns the value atomic_inc yields, i.e. this thread's slot within the queue. */
+ccl_device unsigned int get_local_queue_index(
+        int queue_number,                            /* Queue whose counter is bumped; must be a valid index, -1 (no queue) is not handled here. */
+        ccl_local unsigned int *local_queue_atomics) /* Per-queue counters, indexed by queue number. */
+{
+	return atomic_inc(&local_queue_atomics[queue_number]);
+}
+
+/* Add the local counter of a queue to that queue's global counter; returns the value atomic_add yields for the global counter. */
+ccl_device unsigned int get_global_per_queue_offset(
+        int queue_number,
+        ccl_local unsigned int *local_queue_atomics,
+        ccl_global int* global_queue_atomics)
+{
+	unsigned int local_count = local_queue_atomics[queue_number];
+	return atomic_add(&global_queue_atomics[queue_number], local_count);
+}
+
+/* Returns queuesize*queue_number + lqidx + global_per_queue_offset[queue_number], the same layout enqueue_ray_index_local uses to index the queue buffer. */
+ccl_device unsigned int get_global_queue_index(
+        int queue_number,
+        int queuesize,
+        unsigned int lqidx,
+        ccl_local unsigned int *global_per_queue_offset)
+{
+	return queuesize * queue_number + lqidx + global_per_queue_offset[queue_number];
+}
+
+#endif // __KERNEL_QUEUE_H__
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index 236f74c0a82..631a2cb75de 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "kernel_jitter.h"
@@ -98,7 +98,7 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons
 	return index;
 }
 
-ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
+ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
 {
 #ifdef __CMJ__
 	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -132,7 +132,7 @@ ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int
 #endif
 }
 
-ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_inline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
 {
 #ifdef __CMJ__
 	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -149,7 +149,7 @@ ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int
 	}
 }
 
-ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
+ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, ccl_addr_space RNG *rng, int x, int y, float *fx, float *fy)
 {
 #ifdef __SOBOL_FULL_SCREEN__
 	uint px, py;
@@ -261,12 +261,12 @@ ccl_device uint lcg_init(uint seed)
  * For branches in the path we must be careful not to reuse the same number
  * in a sequence and offset accordingly. */
 
-ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
 {
 	return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension);
 }
 
-ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
 {
 	/* the rng_offset is not increased for transparent bounces. if we do then
 	 * fully transparent objects can become subtly visible by the different
@@ -279,23 +279,23 @@ ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *r
 	return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension);
 }
 
-ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension, float *fx, float *fy)
+ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy)
 {
 	path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy);
 }
 
-ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
 {
 	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension);
 }
 
-ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
 {
 	int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
 	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension);
 }
 
-ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
+ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
 {
 	path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy);
 }
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index db08c328d7e..6b560f5fdb2 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /*
@@ -37,13 +37,13 @@ CCL_NAMESPACE_BEGIN
 #ifdef __OBJECT_MOTION__
 ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
 {
-	if(sd->flag & SD_OBJECT_MOTION) {
-		sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time);
-		sd->ob_itfm = transform_quick_inverse(sd->ob_tfm);
+	if(ccl_fetch(sd, flag) & SD_OBJECT_MOTION) { /* motion flag set: the transform is fetched for the given time */
+		ccl_fetch(sd, ob_tfm) = object_fetch_transform_motion(kg, ccl_fetch(sd, object), time);
+		ccl_fetch(sd, ob_itfm) = transform_quick_inverse(ccl_fetch(sd, ob_tfm)); /* inverse is derived from the fetched transform, not fetched separately */
 	}
 	else {
-		sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
-		sd->ob_itfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+		ccl_fetch(sd, ob_tfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); /* no motion flag: transform and inverse are both fetched */
+		ccl_fetch(sd, ob_itfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
 	}
 }
 #endif
@@ -52,55 +52,55 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
 	const Intersection *isect, const Ray *ray, int bounce, int transparent_bounce)
 {
 #ifdef __INSTANCING__
-	sd->object = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
+	ccl_fetch(sd, object) = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
 #endif
 
-	sd->type = isect->type;
-	sd->flag = kernel_tex_fetch(__object_flag, sd->object);
+	ccl_fetch(sd, type) = isect->type;
+	ccl_fetch(sd, flag) = kernel_tex_fetch(__object_flag, ccl_fetch(sd, object));
 
 	/* matrices and time */
 #ifdef __OBJECT_MOTION__
 	shader_setup_object_transforms(kg, sd, ray->time);
-	sd->time = ray->time;
+	ccl_fetch(sd, time) = ray->time;
 #endif
 
-	sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
-	sd->ray_length = isect->t;
-	sd->ray_depth = bounce;
-	sd->transparent_depth = transparent_bounce;
+	ccl_fetch(sd, prim) = kernel_tex_fetch(__prim_index, isect->prim);
+	ccl_fetch(sd, ray_length) = isect->t;
+	ccl_fetch(sd, ray_depth) = bounce;
+	ccl_fetch(sd, transparent_depth) = transparent_bounce;
 
 #ifdef __UV__
-	sd->u = isect->u;
-	sd->v = isect->v;
+	ccl_fetch(sd, u) = isect->u;
+	ccl_fetch(sd, v) = isect->v;
 #endif
 
 #ifdef __HAIR__
-	if(sd->type & PRIMITIVE_ALL_CURVE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 		/* curve */
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
 
-		sd->shader = __float_as_int(curvedata.z);
-		sd->P = bvh_curve_refine(kg, sd, isect, ray);
+		ccl_fetch(sd, shader) = __float_as_int(curvedata.z);
+		ccl_fetch(sd, P) = bvh_curve_refine(kg, sd, isect, ray);
 	}
 	else
 #endif
-	if(sd->type & PRIMITIVE_TRIANGLE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
 		/* static triangle */
 		float3 Ng = triangle_normal(kg, sd);
-		sd->shader =  kernel_tex_fetch(__tri_shader, sd->prim);
+		ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
 
 		/* vectors */
-		sd->P = triangle_refine(kg, sd, isect, ray);
-		sd->Ng = Ng;
-		sd->N = Ng;
+		ccl_fetch(sd, P) = triangle_refine(kg, sd, isect, ray);
+		ccl_fetch(sd, Ng) = Ng;
+		ccl_fetch(sd, N) = Ng;
 		
 		/* smooth normal */
-		if(sd->shader & SHADER_SMOOTH_NORMAL)
-			sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL)
+			ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));
 
 #ifdef __DPDU__
 		/* dPdu/dPdv */
-		triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+		triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));
 #endif
 	}
 	else {
@@ -108,40 +108,40 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
 		motion_triangle_shader_setup(kg, sd, isect, ray, false);
 	}
 
-	sd->I = -ray->D;
+	ccl_fetch(sd, I) = -ray->D;
 
-	sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
+	ccl_fetch(sd, flag) |= kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2);
 
 #ifdef __INSTANCING__
 	if(isect->object != OBJECT_NONE) {
 		/* instance transform */
-		object_normal_transform(kg, sd, &sd->N);
-		object_normal_transform(kg, sd, &sd->Ng);
+		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
+		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng));
 #ifdef __DPDU__
-		object_dir_transform(kg, sd, &sd->dPdu);
-		object_dir_transform(kg, sd, &sd->dPdv);
+		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
+		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
 #endif
 	}
 #endif
 
 	/* backfacing test */
-	bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
+	bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);
 
 	if(backfacing) {
-		sd->flag |= SD_BACKFACING;
-		sd->Ng = -sd->Ng;
-		sd->N = -sd->N;
+		ccl_fetch(sd, flag) |= SD_BACKFACING;
+		ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
+		ccl_fetch(sd, N) = -ccl_fetch(sd, N);
 #ifdef __DPDU__
-		sd->dPdu = -sd->dPdu;
-		sd->dPdv = -sd->dPdv;
+		ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
+		ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
 #endif
 	}
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* differentials */
-	differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t);
-	differential_incoming(&sd->dI, ray->dD);
-	differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
+	differential_transfer(&ccl_fetch(sd, dP), ray->dP, ray->D, ray->dD, ccl_fetch(sd, Ng), isect->t);
+	differential_incoming(&ccl_fetch(sd, dI), ray->dD);
+	differential_dudv(&ccl_fetch(sd, du), &ccl_fetch(sd, dv), ccl_fetch(sd, dPdu), ccl_fetch(sd, dPdv), ccl_fetch(sd, dP), ccl_fetch(sd, Ng));
 #endif
 }
 
@@ -166,7 +166,7 @@ ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderDat
 	/* fetch triangle data */
 	if(sd->type == PRIMITIVE_TRIANGLE) {
 		float3 Ng = triangle_normal(kg, sd);
-		sd->shader =  kernel_tex_fetch(__tri_shader, sd->prim);
+		sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
 
 		/* static triangle */
 		sd->P = triangle_refine_subsurface(kg, sd, isect, ray);
@@ -230,105 +230,105 @@ ccl_device void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
 	int shader, int object, int prim, float u, float v, float t, float time, int bounce, int transparent_bounce)
 {
 	/* vectors */
-	sd->P = P;
-	sd->N = Ng;
-	sd->Ng = Ng;
-	sd->I = I;
-	sd->shader = shader;
-	sd->type = (prim == PRIM_NONE)? PRIMITIVE_NONE: PRIMITIVE_TRIANGLE;
+	ccl_fetch(sd, P) = P;
+	ccl_fetch(sd, N) = Ng;
+	ccl_fetch(sd, Ng) = Ng;
+	ccl_fetch(sd, I) = I;
+	ccl_fetch(sd, shader) = shader;
+	ccl_fetch(sd, type) = (prim == PRIM_NONE)? PRIMITIVE_NONE: PRIMITIVE_TRIANGLE;
 
 	/* primitive */
 #ifdef __INSTANCING__
-	sd->object = object;
+	ccl_fetch(sd, object) = object;
 #endif
 	/* currently no access to bvh prim index for strand sd->prim*/
-	sd->prim = prim;
+	ccl_fetch(sd, prim) = prim;
 #ifdef __UV__
-	sd->u = u;
-	sd->v = v;
+	ccl_fetch(sd, u) = u;
+	ccl_fetch(sd, v) = v;
 #endif
-	sd->ray_length = t;
-	sd->ray_depth = bounce;
-	sd->transparent_depth = transparent_bounce;
+	ccl_fetch(sd, ray_length) = t;
+	ccl_fetch(sd, ray_depth) = bounce;
+	ccl_fetch(sd, transparent_depth) = transparent_bounce;
 
 	/* detect instancing, for non-instanced the object index is -object-1 */
 #ifdef __INSTANCING__
 	bool instanced = false;
 
-	if(sd->prim != PRIM_NONE) {
-		if(sd->object >= 0)
+	if(ccl_fetch(sd, prim) != PRIM_NONE) {
+		if(ccl_fetch(sd, object) >= 0)
 			instanced = true;
 		else
 #endif
-			sd->object = ~sd->object;
+			ccl_fetch(sd, object) = ~ccl_fetch(sd, object);
 #ifdef __INSTANCING__
 	}
 #endif
 
-	sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
-	if(sd->object != OBJECT_NONE) {
-		sd->flag |= kernel_tex_fetch(__object_flag, sd->object);
+	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2);
+	if(ccl_fetch(sd, object) != OBJECT_NONE) {
+		ccl_fetch(sd, flag) |= kernel_tex_fetch(__object_flag, ccl_fetch(sd, object));
 
 #ifdef __OBJECT_MOTION__
 		shader_setup_object_transforms(kg, sd, time);
 	}
 
-	sd->time = time;
+	ccl_fetch(sd, time) = time;
 #else
 	}
 #endif
 
-	if(sd->type & PRIMITIVE_TRIANGLE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
 		/* smooth normal */
-		if(sd->shader & SHADER_SMOOTH_NORMAL) {
-			sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
+			ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));
 
 #ifdef __INSTANCING__
 			if(instanced)
-				object_normal_transform(kg, sd, &sd->N);
+				object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
 #endif
 		}
 
 		/* dPdu/dPdv */
 #ifdef __DPDU__
-		triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+		triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));
 
 #ifdef __INSTANCING__
 		if(instanced) {
-			object_dir_transform(kg, sd, &sd->dPdu);
-			object_dir_transform(kg, sd, &sd->dPdv);
+			object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
+			object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
 		}
 #endif
 #endif
 	}
 	else {
 #ifdef __DPDU__
-		sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
-		sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
+		ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
+		ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 	}
 
 	/* backfacing test */
-	if(sd->prim != PRIM_NONE) {
-		bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
+	if(ccl_fetch(sd, prim) != PRIM_NONE) {
+		bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);
 
 		if(backfacing) {
-			sd->flag |= SD_BACKFACING;
-			sd->Ng = -sd->Ng;
-			sd->N = -sd->N;
+			ccl_fetch(sd, flag) |= SD_BACKFACING;
+			ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
+			ccl_fetch(sd, N) = -ccl_fetch(sd, N);
 #ifdef __DPDU__
-			sd->dPdu = -sd->dPdu;
-			sd->dPdv = -sd->dPdv;
+			ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
+			ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
 #endif
 		}
 	}
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* no ray differentials here yet */
-	sd->dP = differential3_zero();
-	sd->dI = differential3_zero();
-	sd->du = differential_zero();
-	sd->dv = differential_zero();
+	ccl_fetch(sd, dP) = differential3_zero();
+	ccl_fetch(sd, dI) = differential3_zero();
+	ccl_fetch(sd, du) = differential_zero();
+	ccl_fetch(sd, dv) = differential_zero();
 #endif
 }
 
@@ -355,47 +355,46 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
 ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce, int transparent_bounce)
 {
 	/* vectors */
-	sd->P = ray->D;
-	sd->N = -ray->D;
-	sd->Ng = -ray->D;
-	sd->I = -ray->D;
-	sd->shader = kernel_data.background.surface_shader;
-	sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
+	ccl_fetch(sd, P) = ray->D; /* the ray direction is stored as the position */
+	ccl_fetch(sd, N) = -ray->D; /* both normals and the incoming vector are the negated ray direction */
+	ccl_fetch(sd, Ng) = -ray->D;
+	ccl_fetch(sd, I) = -ray->D;
+	ccl_fetch(sd, shader) = kernel_data.background.surface_shader;
+	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2); /* flags come from the shader only; no object flags are added here */
 #ifdef __OBJECT_MOTION__
-	sd->time = ray->time;
+	ccl_fetch(sd, time) = ray->time;
 #endif
-	sd->ray_length = 0.0f;
-	sd->ray_depth = bounce;
-	sd->transparent_depth = transparent_bounce;
+	ccl_fetch(sd, ray_length) = 0.0f;
+	ccl_fetch(sd, ray_depth) = bounce;
+	ccl_fetch(sd, transparent_depth) = transparent_bounce;
 
 #ifdef __INSTANCING__
-	sd->object = PRIM_NONE;
+	ccl_fetch(sd, object) = PRIM_NONE; /* NOTE(review): object is set to PRIM_NONE while other code in this file compares sd->object against OBJECT_NONE - confirm the two constants agree */
 #endif
-	sd->prim = PRIM_NONE;
+	ccl_fetch(sd, prim) = PRIM_NONE;
 #ifdef __UV__
-	sd->u = 0.0f;
-	sd->v = 0.0f;
+	ccl_fetch(sd, u) = 0.0f;
+	ccl_fetch(sd, v) = 0.0f;
 #endif
 
 #ifdef __DPDU__
 	/* dPdu/dPdv */
-	sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
-	sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
+	ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
+	ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* differentials */
-	sd->dP = ray->dD;
-	differential_incoming(&sd->dI, sd->dP);
-	sd->du.dx = 0.0f;
-	sd->du.dy = 0.0f;
-	sd->dv.dx = 0.0f;
-	sd->dv.dy = 0.0f;
+	ccl_fetch(sd, dP) = ray->dD; /* dP takes the ray's direction differential, matching P = ray->D above */
+	differential_incoming(&ccl_fetch(sd, dI), ccl_fetch(sd, dP));
+	ccl_fetch(sd, du) = differential_zero(); /* replaces the four per-component 0.0f assignments to du/dv */
+	ccl_fetch(sd, dv) = differential_zero();
 #endif
 }
 
 /* ShaderData setup from point inside volume */
 
+#ifdef __VOLUME__
 ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce, int transparent_bounce)
 {
 	/* vectors */
@@ -441,6 +440,7 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s
 	sd->ray_P = ray->P;
 	sd->ray_dP = ray->dP;
 }
+#endif
 
 /* Merging */
 
@@ -459,7 +459,7 @@ ccl_device void shader_merge_closures(ShaderData *sd)
 				continue;
 #endif
 
-			if(!(sci->type == scj->type && sci->data0 == scj->data0 && sci->data1 == scj->data1))
+			if(!(sci->type == scj->type && sci->data0 == scj->data0 && sci->data1 == scj->data1 && sci->data2 == scj->data2))
 				continue;
 
 			if(CLOSURE_IS_BSDF_OR_BSSRDF(sci->type)) {
@@ -480,6 +480,7 @@ ccl_device void shader_merge_closures(ShaderData *sd)
 			}
 
 			sd->num_closure--;
+			kernel_assert(sd->num_closure >= 0);
 			j--;
 		}
 	}
@@ -493,11 +494,11 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, const ShaderDa
 {
 	/* this is the veach one-sample model with balance heuristic, some pdf
 	 * factors drop out when using balance heuristic weighting */
-	for(int i = 0; i< sd->num_closure; i++) {
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
 		if(i == skip_bsdf)
 			continue;
 
-		const ShaderClosure *sc = &sd->closure[i];
+		const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSDF(sc->type)) {
 			float bsdf_pdf = 0.0f;
@@ -515,7 +516,7 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, const ShaderDa
 	*pdf = (sum_sample_weight > 0.0f)? sum_pdf/sum_sample_weight: 0.0f;
 }
 
-ccl_device void shader_bsdf_eval(KernelGlobals *kg, const ShaderData *sd,
+ccl_device void shader_bsdf_eval(KernelGlobals *kg, ShaderData *sd,
 	const float3 omega_in, BsdfEval *eval, float *pdf)
 {
 	bsdf_eval_init(eval, NBUILTIN_CLOSURES, make_float3(0.0f, 0.0f, 0.0f), kernel_data.film.use_light_pass);
@@ -529,22 +530,22 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
 {
 	int sampled = 0;
 
-	if(sd->num_closure > 1) {
+	if(ccl_fetch(sd, num_closure) > 1) {
 		/* pick a BSDF closure based on sample weights */
 		float sum = 0.0f;
 
-		for(sampled = 0; sampled < sd->num_closure; sampled++) {
-			const ShaderClosure *sc = &sd->closure[sampled];
+		for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
+			const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
 			
 			if(CLOSURE_IS_BSDF(sc->type))
 				sum += sc->sample_weight;
 		}
 
-		float r = sd->randb_closure*sum;
+		float r = ccl_fetch(sd, randb_closure)*sum;
 		sum = 0.0f;
 
-		for(sampled = 0; sampled < sd->num_closure; sampled++) {
-			const ShaderClosure *sc = &sd->closure[sampled];
+		for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
+			const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
 			
 			if(CLOSURE_IS_BSDF(sc->type)) {
 				sum += sc->sample_weight;
@@ -554,13 +555,14 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
 			}
 		}
 
-		if(sampled == sd->num_closure) {
+		if(sampled == ccl_fetch(sd, num_closure)) {
 			*pdf = 0.0f;
 			return LABEL_NONE;
 		}
 	}
 
-	const ShaderClosure *sc = &sd->closure[sampled];
+	const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
+
 	int label;
 	float3 eval;
 
@@ -570,7 +572,7 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
 	if(*pdf != 0.0f) {
 		bsdf_eval_init(bsdf_eval, sc->type, eval*sc->weight, kernel_data.film.use_light_pass);
 
-		if(sd->num_closure > 1) {
+		if(ccl_fetch(sd, num_closure) > 1) {
 			float sweight = sc->sample_weight;
 			_shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight);
 		}
@@ -597,8 +599,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, const ShaderData *s
 
 ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness)
 {
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSDF(sc->type))
 			bsdf_blur(kg, sc, roughness);
@@ -607,13 +609,13 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn
 
 ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
 {
-	if(sd->flag & SD_HAS_ONLY_VOLUME)
+	if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME)
 		return make_float3(1.0f, 1.0f, 1.0f);
 
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl
 			eval += sc->weight;
@@ -636,8 +638,8 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
 			eval += sc->weight;
@@ -650,8 +652,8 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
 			eval += sc->weight;
@@ -664,8 +666,8 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
 			eval += sc->weight;
@@ -678,10 +680,10 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
-		if(CLOSURE_IS_BSSRDF(sc->type))
+		if(CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type))
 			eval += sc->weight;
 	}
 
@@ -693,8 +695,8 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 	float3 N = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
 			eval += sc->weight*ao_factor;
@@ -702,12 +704,12 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
 		}
 		else if(CLOSURE_IS_AMBIENT_OCCLUSION(sc->type)) {
 			eval += sc->weight;
-			N += sd->N*average(sc->weight);
+			N += ccl_fetch(sd, N)*average(sc->weight);
 		}
 	}
 
 	if(is_zero(N))
-		N = sd->N;
+		N = ccl_fetch(sd, N);
 	else
 		N = normalize(N);
 
@@ -721,8 +723,8 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 	float3 N = make_float3(0.0f, 0.0f, 0.0f);
 	float texture_blur = 0.0f, weight_sum = 0.0f;
 
-	for(int i = 0; i< sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_BSSRDF(sc->type)) {
 			float avg_weight = fabsf(average(sc->weight));
@@ -735,7 +737,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 	}
 
 	if(N_)
-		*N_ = (is_zero(N))? sd->N: normalize(N);
+		*N_ = (is_zero(N))? ccl_fetch(sd, N): normalize(N);
 
 	if(texture_blur_)
 		*texture_blur_ = texture_blur/weight_sum;
@@ -747,7 +749,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 
 ccl_device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *sc)
 {
-	return emissive_simple_eval(sd->Ng, sd->I);
+	return emissive_simple_eval(ccl_fetch(sd, Ng), ccl_fetch(sd, I));
 }
 
 ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
@@ -755,8 +757,8 @@ ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
 	float3 eval;
 	eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_EMISSION(sc->type))
 			eval += emissive_eval(kg, sd, sc)*sc->weight;
@@ -771,8 +773,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 weight = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 		if(CLOSURE_IS_HOLDOUT(sc->type))
 			weight += sc->weight;
@@ -786,8 +788,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
 	float randb, int path_flag, ShaderContext ctx)
 {
-	sd->num_closure = 0;
-	sd->randb_closure = randb;
+	ccl_fetch(sd, num_closure) = 0;
+	ccl_fetch(sd, randb_closure) = randb;
 
 #ifdef __OSL__
 	if(kg->osl)
@@ -798,9 +800,11 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
 #ifdef __SVM__
 		svm_eval_nodes(kg, sd, SHADER_TYPE_SURFACE, path_flag);
 #else
-		sd->closure->weight = make_float3(0.8f, 0.8f, 0.8f);
-		sd->closure->N = sd->N;
-		sd->flag |= bsdf_diffuse_setup(&sd->closure);
+		ccl_fetch_array(sd, closure, 0)->weight = make_float3(0.8f, 0.8f, 0.8f);
+		ccl_fetch_array(sd, closure, 0)->N = ccl_fetch(sd, N);
+		ccl_fetch_array(sd, closure, 0)->data0 = 0.0f;
+		ccl_fetch_array(sd, closure, 0)->data1 = 0.0f;
+		ccl_fetch(sd, flag) |= bsdf_diffuse_setup(ccl_fetch_array(sd, closure, 0));
 #endif
 	}
 }
@@ -809,8 +813,8 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
 
 ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int path_flag, ShaderContext ctx)
 {
-	sd->num_closure = 0;
-	sd->randb_closure = 0.0f;
+	ccl_fetch(sd, num_closure) = 0;
+	ccl_fetch(sd, randb_closure) = 0.0f;
 
 #ifdef __OSL__
 	if(kg->osl) {
@@ -825,8 +829,8 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int
 
 		float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-		for(int i = 0; i< sd->num_closure; i++) {
-			const ShaderClosure *sc = &sd->closure[i];
+		for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+			const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
 
 			if(CLOSURE_IS_BACKGROUND(sc->type))
 				eval += sc->weight;
@@ -846,7 +850,7 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int
 ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, const float3 omega_in, float *pdf,
 	int skip_phase, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight)
 {
-	for(int i = 0; i< sd->num_closure; i++) {
+	for(int i = 0; i < sd->num_closure; i++) {
 		if(i == skip_phase)
 			continue;
 
@@ -999,8 +1003,8 @@ ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd,
 
 ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx)
 {
-	sd->num_closure = 0;
-	sd->randb_closure = 0.0f;
+	ccl_fetch(sd, num_closure) = 0;
+	ccl_fetch(sd, randb_closure) = 0.0f;
 
 	/* this will modify sd->P */
 #ifdef __SVM__
diff --git a/intern/cycles/kernel/kernel_shaderdata_vars.h b/intern/cycles/kernel/kernel_shaderdata_vars.h
new file mode 100644
index 00000000000..b157b82e023
--- /dev/null
+++ b/intern/cycles/kernel/kernel_shaderdata_vars.h
@@ -0,0 +1,99 @@
+/*
+* Copyright 2011-2015 Blender Foundation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef SD_VAR
+#define SD_VAR(type, what)
+#endif
+#ifndef SD_CLOSURE_VAR
+#define SD_CLOSURE_VAR(type, what, max_closure)
+#endif
+
+/* position */
+SD_VAR(float3, P)
+/* smooth normal for shading */
+SD_VAR(float3, N)
+/* true geometric normal */
+SD_VAR(float3, Ng)
+/* view/incoming direction */
+SD_VAR(float3, I)
+/* shader id */
+SD_VAR(int, shader)
+/* booleans describing shader, see ShaderDataFlag */
+SD_VAR(int, flag)
+
+/* primitive id if there is one, ~0 otherwise */
+SD_VAR(int, prim)
+
+/* combined type and curve segment for hair */
+SD_VAR(int, type)
+
+/* parametric coordinates
+* - barycentric weights for triangles */
+SD_VAR(float, u)
+SD_VAR(float, v)
+/* object id if there is one, ~0 otherwise */
+SD_VAR(int, object)
+
+/* motion blur sample time */
+SD_VAR(float, time)
+
+/* length of the ray being shaded */
+SD_VAR(float, ray_length)
+
+/* ray bounce depth */
+SD_VAR(int, ray_depth)
+
+/* ray transparent depth */
+SD_VAR(int, transparent_depth)
+
+#ifdef __RAY_DIFFERENTIALS__
+/* differential of P. these are orthogonal to Ng, not N */
+SD_VAR(differential3, dP)
+/* differential of I */
+SD_VAR(differential3, dI)
+/* differential of u, v */
+SD_VAR(differential, du)
+SD_VAR(differential, dv)
+#endif
+#ifdef __DPDU__
+/* differential of P w.r.t. parametric coordinates. note that dPdu is
+* not readily suitable as a tangent for shading on triangles. */
+SD_VAR(float3, dPdu)
+SD_VAR(float3, dPdv)
+#endif
+
+#ifdef __OBJECT_MOTION__
+/* object <-> world space transformations, cached to avoid
+* re-interpolating them constantly for shading */
+SD_VAR(Transform, ob_tfm)
+SD_VAR(Transform, ob_itfm)
+#endif
+
+/* Closure data, we store a fixed array of closures */
+SD_CLOSURE_VAR(ShaderClosure, closure, MAX_CLOSURE)
+SD_VAR(int, num_closure)
+SD_VAR(float, randb_closure)
+
+/* ray start position, only set for backgrounds */
+SD_VAR(float3, ray_P)
+SD_VAR(differential3, ray_dP)
+
+#ifdef __OSL__
+SD_VAR(struct KernelGlobals *, osl_globals)
+#endif
+
+#undef SD_VAR
+#undef SD_CLOSURE_VAR
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index 61954282c28..2811a8348ca 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -39,19 +39,6 @@ CCL_NAMESPACE_BEGIN
  * This is CPU only because of qsort, and malloc or high stack space usage to
  * record all these intersections. */
 
-ccl_device_noinline int shadow_intersections_compare(const void *a, const void *b)
-{
-	const Intersection *isect_a = (const Intersection*)a;
-	const Intersection *isect_b = (const Intersection*)b;
-
-	if(isect_a->t < isect_b->t)
-		return -1;
-	else if(isect_a->t > isect_b->t)
-		return 1;
-	else
-		return 0;
-}
-
 #define STACK_MAX_HITS 64
 
 ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ray, float3 *shadow)
@@ -95,7 +82,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 			PathState ps = *state;
 #endif
 
-			qsort(hits, num_hits, sizeof(Intersection), shadow_intersections_compare);
+			qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
 
 			for(int hit = 0; hit < num_hits; hit++, isect++) {
 				/* adjust intersection distance for moving ray forward */
@@ -193,19 +180,36 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
  * potentially transparent, and only in that case start marching. this gives
  * one extra ray cast for the cases were we do want transparency. */
 
-ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ray, float3 *shadow)
+ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space Ray *ray_input, float3 *shadow
+#ifdef __SPLIT_KERNEL__
+                                      , ShaderData *sd_mem, Intersection *isect_mem
+#endif
+                                      )
 {
 	*shadow = make_float3(1.0f, 1.0f, 1.0f);
 
-	if(ray->t == 0.0f)
+	if(ray_input->t == 0.0f)
 		return false;
 
-	Intersection isect;
-	bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
+#ifdef __SPLIT_KERNEL__
+	Ray private_ray = *ray_input;
+	Ray *ray = &private_ray;
+#else
+	Ray *ray = ray_input;
+#endif
+
+#ifdef __SPLIT_KERNEL__
+	Intersection *isect = isect_mem;
+#else
+	Intersection isect_object;
+	Intersection *isect = &isect_object;
+#endif
+
+	bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);
 
 #ifdef __TRANSPARENT_SHADOWS__
 	if(blocked && kernel_data.integrator.transparent_shadows) {
-		if(shader_transparent_shadow(kg, &isect)) {
+		if(shader_transparent_shadow(kg, isect)) {
 			float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
 			float3 Pend = ray->P + ray->D*ray->t;
 			int bounce = state->transparent_bounce;
@@ -217,9 +221,8 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 				if(bounce >= kernel_data.integrator.transparent_max_bounce)
 					return true;
 
-				if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect, NULL, 0.0f, 0.0f))
+				if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f))
 				{
-
 #ifdef __VOLUME__
 					/* attenuation for last line segment towards light */
 					if(ps.volume_stack[0].shader != SHADER_NONE)
@@ -231,39 +234,44 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 					return false;
 				}
 
-				if(!shader_transparent_shadow(kg, &isect))
+				if(!shader_transparent_shadow(kg, isect))
 					return true;
 
 #ifdef __VOLUME__
 				/* attenuation between last surface and next surface */
 				if(ps.volume_stack[0].shader != SHADER_NONE) {
 					Ray segment_ray = *ray;
-					segment_ray.t = isect.t;
+					segment_ray.t = isect->t;
 					kernel_volume_shadow(kg, &ps, &segment_ray, &throughput);
 				}
 #endif
 
 				/* setup shader data at surface */
-				ShaderData sd;
-				shader_setup_from_ray(kg, &sd, &isect, ray, state->bounce+1, bounce);
+#ifdef __SPLIT_KERNEL__
+				ShaderData *sd = sd_mem;
+#else
+				ShaderData sd_object;
+				ShaderData *sd = &sd_object;
+#endif
+				shader_setup_from_ray(kg, sd, isect, ray, state->bounce+1, bounce);
 
 				/* attenuation from transparent surface */
-				if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
-					shader_eval_surface(kg, &sd, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
-					throughput *= shader_bsdf_transparency(kg, &sd);
+				if(!(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME)) {
+					shader_eval_surface(kg, sd, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
+					throughput *= shader_bsdf_transparency(kg, sd);
 				}
 
 				if(is_zero(throughput))
 					return true;
 
 				/* move ray forward */
-				ray->P = ray_offset(sd.P, -sd.Ng);
+				ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng));
 				if(ray->t != FLT_MAX)
 					ray->D = normalize_len(Pend - ray->P, &ray->t);
 
 #ifdef __VOLUME__
 				/* exit/enter volume */
-				kernel_volume_stack_enter_exit(kg, &sd, ps.volume_stack);
+				kernel_volume_stack_enter_exit(kg, sd, ps.volume_stack);
 #endif
 
 				bounce++;
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index fb927e81f22..2da060c32a2 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index ef46b2f707f..f545a056cc8 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef KERNEL_TEX
@@ -24,6 +24,7 @@
 
 /* bvh */
 KERNEL_TEX(float4, texture_float4, __bvh_nodes)
+KERNEL_TEX(float4, texture_float4, __bvh_leaf_nodes)
 KERNEL_TEX(float4, texture_float4, __tri_woop)
 KERNEL_TEX(uint, texture_uint, __prim_type)
 KERNEL_TEX(uint, texture_uint, __prim_visibility)
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 0ec34dae87a..60973a71d20 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __KERNEL_TYPES_H__
@@ -24,12 +24,19 @@
 #define __KERNEL_CPU__
 #endif
 
+/* TODO(sergey): This is only to make it possible to include this header
+ * from outside of the kernel, but could this be done somewhat cleaner?
+ */
+#ifndef ccl_addr_space
+#define ccl_addr_space
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 /* constants */
 #define OBJECT_SIZE 		11
 #define OBJECT_VECTOR_SIZE	6
-#define LIGHT_SIZE			4
+#define LIGHT_SIZE			5
 #define FILTER_TABLE_SIZE	256
 #define RAMP_TABLE_SIZE		256
 #define PARTICLE_SIZE 		5
@@ -38,12 +45,6 @@ CCL_NAMESPACE_BEGIN
 #define BSSRDF_MIN_RADIUS			1e-8f
 #define BSSRDF_MAX_HITS				4
 
-#define BB_DRAPPER				800.0f
-#define BB_MAX_TABLE_RANGE		12000.0f
-#define BB_TABLE_XPOWER			1.5f
-#define BB_TABLE_YPOWER			5.0f
-#define BB_TABLE_SPACING		2.0f
-
 #define BECKMANN_TABLE_SIZE		256
 
 #define TEX_NUM_FLOAT_IMAGES	5
@@ -57,6 +58,9 @@ CCL_NAMESPACE_BEGIN
 
 /* device capabilities */
 #ifdef __KERNEL_CPU__
+#ifdef __KERNEL_SSE2__
+#  define __QBVH__
+#endif
 #define __KERNEL_SHADING__
 #define __KERNEL_ADV_SHADING__
 #define __BRANCHED_PATH__
@@ -69,6 +73,7 @@ CCL_NAMESPACE_BEGIN
 #define __VOLUME_DECOUPLED__
 #define __VOLUME_SCATTER__
 #define __SHADOW_RECORD_ALL__
+#define __VOLUME_RECORD_ALL__
 #define __CAMERA_RAY_NODES__
 #endif
 
@@ -80,7 +85,7 @@ CCL_NAMESPACE_BEGIN
 #define __VOLUME_SCATTER__
 
 /* Experimental on GPU */
-#ifdef __KERNEL_CUDA_EXPERIMENTAL__
+#ifdef __KERNEL_EXPERIMENTAL__
 #define __SUBSURFACE__
 #define __CMJ__
 #endif
@@ -92,38 +97,51 @@ CCL_NAMESPACE_BEGIN
 /* keep __KERNEL_ADV_SHADING__ in sync with opencl_kernel_use_advanced_shading! */
 
 #ifdef __KERNEL_OPENCL_NVIDIA__
-#define __KERNEL_SHADING__
-#define __KERNEL_ADV_SHADING__
+#  define __KERNEL_SHADING__
+#  define __KERNEL_ADV_SHADING__
+#  ifdef __KERNEL_EXPERIMENTAL__
+#    define __CMJ__
+#  endif
 #endif
 
 #ifdef __KERNEL_OPENCL_APPLE__
-#define __KERNEL_SHADING__
-//#define __KERNEL_ADV_SHADING__
+#  define __KERNEL_SHADING__
+#  define __KERNEL_ADV_SHADING__
+/* TODO(sergey): Currently experimental section is ignored here,
+ * this is because megakernel in device_opencl does not support
+ * custom cflags depending on the scene features.
+ */
+#  ifdef __KERNEL_EXPERIMENTAL__
+#    define __CMJ__
+#  endif
 #endif
 
 #ifdef __KERNEL_OPENCL_AMD__
-#define __CL_USE_NATIVE__
-#define __KERNEL_SHADING__
-//__KERNEL_ADV_SHADING__
-#define __MULTI_CLOSURE__
-#define __TRANSPARENT_SHADOWS__
-#define __PASSES__
-#define __BACKGROUND_MIS__
-#define __LAMP_MIS__
-#define __AO__
-//#define __CAMERA_MOTION__
-//#define __OBJECT_MOTION__
-//#define __HAIR__
-//end __KERNEL_ADV_SHADING__
+#  define __CL_USE_NATIVE__
+#  define __KERNEL_SHADING__
+#  define __MULTI_CLOSURE__
+#  define __PASSES__
+#  define __BACKGROUND_MIS__
+#  define __LAMP_MIS__
+#  define __AO__
+#  define __CAMERA_MOTION__
+#  define __OBJECT_MOTION__
+#  define __HAIR__
+#  ifdef __KERNEL_EXPERIMENTAL__
+#    define __TRANSPARENT_SHADOWS__
+#  endif
 #endif
 
 #ifdef __KERNEL_OPENCL_INTEL_CPU__
-#define __CL_USE_NATIVE__
-#define __KERNEL_SHADING__
-#define __KERNEL_ADV_SHADING__
+#  define __CL_USE_NATIVE__
+#  define __KERNEL_SHADING__
+#  define __KERNEL_ADV_SHADING__
+#  ifdef __KERNEL_EXPERIMENTAL__
+#    define __CMJ__
+#  endif
 #endif
 
-#endif
+#endif // __KERNEL_OPENCL__
 
 /* kernel features */
 #define __SOBOL__
@@ -158,6 +176,21 @@ CCL_NAMESPACE_BEGIN
 #define __HAIR__
 #endif
 
+#ifdef WITH_CYCLES_DEBUG
+#  define __KERNEL_DEBUG__
+#endif
+
+/* Scene-based selective features compilation. */
+#ifdef __NO_CAMERA_MOTION__
+#  undef __CAMERA_MOTION__
+#endif
+#ifdef __NO_OBJECT_MOTION__
+#  undef __OBJECT_MOTION__
+#endif
+#ifdef __NO_HAIR__
+#  undef __HAIR__
+#endif
+
 /* Random Numbers */
 
 typedef uint RNG;
@@ -263,9 +296,7 @@ enum PathRayFlag {
 
 	PATH_RAY_MIS_SKIP = 2048,
 	PATH_RAY_DIFFUSE_ANCESTOR = 4096,
-	PATH_RAY_GLOSSY_ANCESTOR = 8192,
-	PATH_RAY_BSSRDF_ANCESTOR = 16384,
-	PATH_RAY_SINGLE_PASS_DONE = 32768,
+	PATH_RAY_SINGLE_PASS_DONE = 8192,
 
 	/* we need layer member flags to be the 20 upper bits */
 	PATH_RAY_LAYER_SHIFT = (32-20)
@@ -288,39 +319,44 @@ typedef enum ClosureLabel {
 
 typedef enum PassType {
 	PASS_NONE = 0,
-	PASS_COMBINED = 1,
-	PASS_DEPTH = 2,
-	PASS_NORMAL = 4,
-	PASS_UV = 8,
-	PASS_OBJECT_ID = 16,
-	PASS_MATERIAL_ID = 32,
-	PASS_DIFFUSE_COLOR = 64,
-	PASS_GLOSSY_COLOR = 128,
-	PASS_TRANSMISSION_COLOR = 256,
-	PASS_DIFFUSE_INDIRECT = 512,
-	PASS_GLOSSY_INDIRECT = 1024,
-	PASS_TRANSMISSION_INDIRECT = 2048,
-	PASS_DIFFUSE_DIRECT = 4096,
-	PASS_GLOSSY_DIRECT = 8192,
-	PASS_TRANSMISSION_DIRECT = 16384,
-	PASS_EMISSION = 32768,
-	PASS_BACKGROUND = 65536,
-	PASS_AO = 131072,
-	PASS_SHADOW = 262144,
-	PASS_MOTION = 524288,
-	PASS_MOTION_WEIGHT = 1048576,
-	PASS_MIST = 2097152,
-	PASS_SUBSURFACE_DIRECT = 4194304,
-	PASS_SUBSURFACE_INDIRECT = 8388608,
-	PASS_SUBSURFACE_COLOR = 16777216,
-	PASS_LIGHT = 33554432, /* no real pass, used to force use_light_pass */
+	PASS_COMBINED = (1 << 0),
+	PASS_DEPTH = (1 << 1),
+	PASS_NORMAL = (1 << 2),
+	PASS_UV = (1 << 3),
+	PASS_OBJECT_ID = (1 << 4),
+	PASS_MATERIAL_ID = (1 << 5),
+	PASS_DIFFUSE_COLOR = (1 << 6),
+	PASS_GLOSSY_COLOR = (1 << 7),
+	PASS_TRANSMISSION_COLOR = (1 << 8),
+	PASS_DIFFUSE_INDIRECT = (1 << 9),
+	PASS_GLOSSY_INDIRECT = (1 << 10),
+	PASS_TRANSMISSION_INDIRECT = (1 << 11),
+	PASS_DIFFUSE_DIRECT = (1 << 12),
+	PASS_GLOSSY_DIRECT = (1 << 13),
+	PASS_TRANSMISSION_DIRECT = (1 << 14),
+	PASS_EMISSION = (1 << 15),
+	PASS_BACKGROUND = (1 << 16),
+	PASS_AO = (1 << 17),
+	PASS_SHADOW = (1 << 18),
+	PASS_MOTION = (1 << 19),
+	PASS_MOTION_WEIGHT = (1 << 20),
+	PASS_MIST = (1 << 21),
+	PASS_SUBSURFACE_DIRECT = (1 << 22),
+	PASS_SUBSURFACE_INDIRECT = (1 << 23),
+	PASS_SUBSURFACE_COLOR = (1 << 24),
+	PASS_LIGHT = (1 << 25), /* no real pass, used to force use_light_pass */
+#ifdef __KERNEL_DEBUG__
+	PASS_BVH_TRAVERSAL_STEPS = (1 << 26),
+	PASS_BVH_TRAVERSED_INSTANCES = (1 << 27),
+	PASS_RAY_BOUNCES = (1 << 28),
+#endif
 } PassType;
 
 #define PASS_ALL (~0)
 
 #ifdef __PASSES__
 
-typedef struct PathRadiance {
+typedef ccl_addr_space struct PathRadiance {
 	int use_light_pass;
 
 	float3 emission;
@@ -372,7 +408,7 @@ typedef struct BsdfEval {
 
 #else
 
-typedef float3 PathRadiance;
+typedef ccl_addr_space float3 PathRadiance;
 typedef float3 BsdfEval;
 
 #endif
@@ -417,6 +453,7 @@ enum CameraType {
 
 enum PanoramaType {
 	PANORAMA_EQUIRECTANGULAR,
+	PANORAMA_MIRRORBALL,
 	PANORAMA_FISHEYE_EQUIDISTANT,
 	PANORAMA_FISHEYE_EQUISOLID
 };
@@ -436,10 +473,26 @@ typedef struct differential {
 /* Ray */
 
 typedef struct Ray {
+/* TODO(sergey): This is only needed because the current AMD
+ * compiler has a hard time building the kernel with this
+ * reshuffle. And at the same time the reshuffle will cause
+ * less optimal CPU code in certain places.
+ *
+ * We'll get rid of this nasty exception once AMD compiler
+ * is fixed.
+ */
+#ifndef __KERNEL_OPENCL_AMD__
 	float3 P;		/* origin */
 	float3 D;		/* direction */
+
+	float t;		/* length of the ray */
+	float time;		/* time (for motion blur) */
+#else
 	float t;		/* length of the ray */
 	float time;		/* time (for motion blur) */
+	float3 P;		/* origin */
+	float3 D;		/* direction */
+#endif
 
 #ifdef __RAY_DIFFERENTIALS__
 	differential3 dP;
@@ -449,11 +502,16 @@ typedef struct Ray {
 
 /* Intersection */
 
-typedef struct Intersection {
+typedef ccl_addr_space struct Intersection {
 	float t, u, v;
 	int prim;
 	int object;
 	int type;
+
+#ifdef __KERNEL_DEBUG__
+	int num_traversal_steps;
+	int num_traversed_instances;
+#endif
 } Intersection;
 
 /* Primitives */
@@ -468,7 +526,12 @@ typedef enum PrimitiveType {
 	PRIMITIVE_ALL_TRIANGLE = (PRIMITIVE_TRIANGLE|PRIMITIVE_MOTION_TRIANGLE),
 	PRIMITIVE_ALL_CURVE = (PRIMITIVE_CURVE|PRIMITIVE_MOTION_CURVE),
 	PRIMITIVE_ALL_MOTION = (PRIMITIVE_MOTION_TRIANGLE|PRIMITIVE_MOTION_CURVE),
-	PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE|PRIMITIVE_ALL_CURVE)
+	PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE|PRIMITIVE_ALL_CURVE),
+
+	/* Total number of different primitives.
+	 * NOTE: This is an actual value, not a bitflag.
+	 */
+	PRIMITIVE_NUM_TOTAL = 4,
 } PrimitiveType;
 
 #define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << 16) | type)
@@ -516,6 +579,7 @@ typedef enum AttributeStandard {
 	ATTR_STD_VOLUME_FLAME,
 	ATTR_STD_VOLUME_HEAT,
 	ATTR_STD_VOLUME_VELOCITY,
+	ATTR_STD_POINTINESS,
 	ATTR_STD_NUM,
 
 	ATTR_STD_NOT_FOUND = ~0
@@ -524,39 +588,34 @@ typedef enum AttributeStandard {
 /* Closure data */
 
 #ifdef __MULTI_CLOSURE__
-#define MAX_CLOSURE 64
+#  ifndef __MAX_CLOSURE__
+#     define MAX_CLOSURE 64
+#  else
+#    define MAX_CLOSURE __MAX_CLOSURE__
+#  endif
 #else
 #define MAX_CLOSURE 1
 #endif
 
-/* TODO(sergey): This is rather nasty bug happening in here, which
- * could be simply a compilers bug for which we can't find a generic
- * platform independent workaround. Also even if it's a compiler
- * issue, it's not so simple to upgrade the compiler in the release
- * environment for linux and doing it so closer to the release is
- * rather a risky business.
- *
- * For this release it's probably safer to stick with such a rather
- * dirty solution, and look for a cleaner fix during the next release
- * cycle.
+/* This struct is to be 16-byte aligned; we also take some extra precautions:
+ * - All the float3 members are at the beginning of the struct, so the compiler
+ *   does not insert its own padding when trying to align these members.
+ * - We make sure the OSL pointer is also 16-byte aligned.
  */
-typedef struct ShaderClosure {
-	ClosureType type;
+typedef ccl_addr_space struct ShaderClosure {
 	float3 weight;
-#ifndef __APPLE__
+	float3 N;
+	float3 T;
+
+	ClosureType type;
 	float sample_weight;
-#endif
 	float data0;
 	float data1;
 	float data2;
+	int pad1, pad2, pad3;
 
-	float3 N;
-	float3 T;
-#ifdef __APPLE__
-	float sample_weight;
-#endif
 #ifdef __OSL__
-	void *prim;
+	void *prim, *pad4;
 #endif
 } ShaderClosure;
 
@@ -581,119 +640,70 @@ typedef enum ShaderContext {
 
 enum ShaderDataFlag {
 	/* runtime flags */
-	SD_BACKFACING = 1,		/* backside of surface? */
-	SD_EMISSION = 2,		/* have emissive closure? */
-	SD_BSDF = 4,			/* have bsdf closure? */
-	SD_BSDF_HAS_EVAL = 8,	/* have non-singular bsdf closure? */
-	SD_PHASE_HAS_EVAL = 8,	/* have non-singular phase closure? */
-	SD_BSDF_GLOSSY = 16,	/* have glossy bsdf */
-	SD_BSSRDF = 32,			/* have bssrdf */
-	SD_HOLDOUT = 64,		/* have holdout closure? */
-	SD_ABSORPTION = 128,	/* have volume absorption closure? */
-	SD_SCATTER = 256,		/* have volume phase closure? */
-	SD_AO = 512,			/* have ao closure? */
-	SD_TRANSPARENT = 1024,	/* have transparent closure? */
-
-	SD_CLOSURE_FLAGS = (SD_EMISSION|SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY|
-	                    SD_BSSRDF|SD_HOLDOUT|SD_ABSORPTION|SD_SCATTER|SD_AO),
+	SD_BACKFACING     = (1 << 0),   /* backside of surface? */
+	SD_EMISSION       = (1 << 1),   /* have emissive closure? */
+	SD_BSDF           = (1 << 2),   /* have bsdf closure? */
+	SD_BSDF_HAS_EVAL  = (1 << 3),   /* have non-singular bsdf closure? */
+	SD_BSSRDF         = (1 << 4),   /* have bssrdf */
+	SD_HOLDOUT        = (1 << 5),   /* have holdout closure? */
+	SD_ABSORPTION     = (1 << 6),   /* have volume absorption closure? */
+	SD_SCATTER        = (1 << 7),   /* have volume phase closure? */
+	SD_AO             = (1 << 8),   /* have ao closure? */
+	SD_TRANSPARENT    = (1 << 9),  /* have transparent closure? */
+
+	SD_CLOSURE_FLAGS = (SD_EMISSION|SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF|
+	                    SD_HOLDOUT|SD_ABSORPTION|SD_SCATTER|SD_AO),
 
 	/* shader flags */
-	SD_USE_MIS = 2048,					/* direct light sample */
-	SD_HAS_TRANSPARENT_SHADOW = 4096,	/* has transparent shadow */
-	SD_HAS_VOLUME = 8192,				/* has volume shader */
-	SD_HAS_ONLY_VOLUME = 16384,			/* has only volume shader, no surface */
-	SD_HETEROGENEOUS_VOLUME = 32768,	/* has heterogeneous volume */
-	SD_HAS_BSSRDF_BUMP = 65536,			/* bssrdf normal uses bump */
-	SD_VOLUME_EQUIANGULAR = 131072,		/* use equiangular sampling */
-	SD_VOLUME_MIS = 262144,				/* use multiple importance sampling */
+	SD_USE_MIS                = (1 << 10),  /* direct light sample */
+	SD_HAS_TRANSPARENT_SHADOW = (1 << 11),  /* has transparent shadow */
+	SD_HAS_VOLUME             = (1 << 12),  /* has volume shader */
+	SD_HAS_ONLY_VOLUME        = (1 << 13),  /* has only volume shader, no surface */
+	SD_HETEROGENEOUS_VOLUME   = (1 << 14),  /* has heterogeneous volume */
+	SD_HAS_BSSRDF_BUMP        = (1 << 15),  /* bssrdf normal uses bump */
+	SD_VOLUME_EQUIANGULAR     = (1 << 16),  /* use equiangular sampling */
+	SD_VOLUME_MIS             = (1 << 17),  /* use multiple importance sampling */
+	SD_VOLUME_CUBIC           = (1 << 18),  /* use cubic interpolation for voxels */
+	SD_HAS_BUMP               = (1 << 19),  /* has data connected to the displacement input */
 
 	SD_SHADER_FLAGS = (SD_USE_MIS|SD_HAS_TRANSPARENT_SHADOW|SD_HAS_VOLUME|
 	                   SD_HAS_ONLY_VOLUME|SD_HETEROGENEOUS_VOLUME|
-					   SD_HAS_BSSRDF_BUMP|SD_VOLUME_EQUIANGULAR|SD_VOLUME_MIS),
+	                   SD_HAS_BSSRDF_BUMP|SD_VOLUME_EQUIANGULAR|SD_VOLUME_MIS|
+	                   SD_VOLUME_CUBIC|SD_HAS_BUMP),
 
 	/* object flags */
-	SD_HOLDOUT_MASK = 524288,			/* holdout for camera rays */
-	SD_OBJECT_MOTION = 1048576,			/* has object motion blur */
-	SD_TRANSFORM_APPLIED = 2097152,		/* vertices have transform applied */
-	SD_NEGATIVE_SCALE_APPLIED = 4194304,	/* vertices have negative scale applied */
-
-	SD_OBJECT_FLAGS = (SD_HOLDOUT_MASK|SD_OBJECT_MOTION|SD_TRANSFORM_APPLIED)
+	SD_HOLDOUT_MASK             = (1 << 20),  /* holdout for camera rays */
+	SD_OBJECT_MOTION            = (1 << 21),  /* has object motion blur */
+	SD_TRANSFORM_APPLIED        = (1 << 22),  /* vertices have transform applied */
+	SD_NEGATIVE_SCALE_APPLIED   = (1 << 23),  /* vertices have negative scale applied */
+	SD_OBJECT_HAS_VOLUME        = (1 << 24),  /* object has a volume shader */
+	SD_OBJECT_INTERSECTS_VOLUME = (1 << 25),  /* object intersects AABB of an object with volume shader */
+	SD_OBJECT_HAS_VERTEX_MOTION = (1 << 26),  /* has position for motion vertices */
+
+	SD_OBJECT_FLAGS = (SD_HOLDOUT_MASK|SD_OBJECT_MOTION|SD_TRANSFORM_APPLIED|
+	                   SD_NEGATIVE_SCALE_APPLIED|SD_OBJECT_HAS_VOLUME|
+	                   SD_OBJECT_INTERSECTS_VOLUME)
 };
 
 struct KernelGlobals;
 
-typedef struct ShaderData {
-	/* position */
-	float3 P;
-	/* smooth normal for shading */
-	float3 N;
-	/* true geometric normal */
-	float3 Ng;
-	/* view/incoming direction */
-	float3 I;
-	/* shader id */
-	int shader;
-	/* booleans describing shader, see ShaderDataFlag */
-	int flag;
-
-	/* primitive id if there is one, ~0 otherwise */
-	int prim;
-
-	/* combined type and curve segment for hair */
-	int type;
-
-	/* parametric coordinates
-	 * - barycentric weights for triangles */
-	float u, v;
-	/* object id if there is one, ~0 otherwise */
-	int object;
-
-	/* motion blur sample time */
-	float time;
-	
-	/* length of the ray being shaded */
-	float ray_length;
-	
-	/* ray bounce depth */
-	int ray_depth;
-
-	/* ray transparent depth */
-	int transparent_depth;
-
-#ifdef __RAY_DIFFERENTIALS__
-	/* differential of P. these are orthogonal to Ng, not N */
-	differential3 dP;
-	/* differential of I */
-	differential3 dI;
-	/* differential of u, v */
-	differential du;
-	differential dv;
-#endif
-#ifdef __DPDU__
-	/* differential of P w.r.t. parametric coordinates. note that dPdu is
-	 * not readily suitable as a tangent for shading on triangles. */
-	float3 dPdu, dPdv;
-#endif
-
-#ifdef __OBJECT_MOTION__
-	/* object <-> world space transformations, cached to avoid
-	 * re-interpolating them constantly for shading */
-	Transform ob_tfm;
-	Transform ob_itfm;
+#ifdef __SPLIT_KERNEL__  /* members are pointers, indexed per work-item through TIDX */
+#define SD_VAR(type, what) ccl_global type *what;
+#define SD_CLOSURE_VAR(type, what, max_closure) type *what;
+#define TIDX (get_global_id(1) * get_global_size(0) + get_global_id(0))
+#define ccl_fetch(s, t) ((s)->t[TIDX])
+#define ccl_fetch_array(s, t, index) (&(s)->t[TIDX * MAX_CLOSURE + (index)])
+#else  /* members are stored inline in the struct */
+#define SD_VAR(type, what) type what;
+#define SD_CLOSURE_VAR(type, what, max_closure) type what[max_closure];
+#define ccl_fetch(s, t) ((s)->t)
+#define ccl_fetch_array(s, t, index) (&(s)->t[(index)])
 #endif
 
-	/* Closure data, we store a fixed array of closures */
-	ShaderClosure closure[MAX_CLOSURE];
-	int num_closure;
-	float randb_closure;
+typedef ccl_addr_space struct ShaderData {
 
-	/* ray start position, only set for backgrounds */
-	float3 ray_P;
-	differential3 ray_dP;
+#include "kernel_shaderdata_vars.h"
 
-#ifdef __OSL__
-	struct KernelGlobals *osl_globals;
-#endif
 } ShaderData;
 
 /* Path State */
@@ -711,7 +721,6 @@ typedef struct PathState {
 
 	/* random number generator state */
 	int rng_offset;    		/* dimension offset */
-	int rng_offset_bsdf;  	/* dimension offset for picking bsdf */
 	int sample;        		/* path sample number */
 	int num_samples;		/* total number of times this path will be sampled */
 
@@ -751,6 +760,7 @@ typedef struct KernelCamera {
 	int panorama_type;
 	float fisheye_fov;
 	float fisheye_lens;
+	float4 equirectangular_range;
 
 	/* matrices */
 	Transform cameratoworld;
@@ -768,7 +778,7 @@ typedef struct KernelCamera {
 
 	/* motion blur */
 	float shuttertime;
-	int have_motion;
+	int have_motion, have_perspective_motion;
 
 	/* clipping */
 	float nearclip;
@@ -789,7 +799,7 @@ typedef struct KernelCamera {
 	int shader;
 
 	float focal_length;
-	float pad[3];
+	float pad[2];
 
 	/* more matrices */
 	Transform screentoworld;
@@ -803,6 +813,11 @@ typedef struct KernelCamera {
 	Transform worldtocamera;
 
 	MotionTransform motion;
+
+	/* Denotes changes in the projective matrix, namely in rastertocamera.
+	 * Used for camera zoom motion blur.
+	 */
+	PerspectiveMotionTransform perspective_motion;
 } KernelCamera;
 
 typedef struct KernelFilm {
@@ -850,6 +865,13 @@ typedef struct KernelFilm {
 	float mist_start;
 	float mist_inv_depth;
 	float mist_falloff;
+
+#ifdef __KERNEL_DEBUG__
+	int pass_bvh_traversal_steps;
+	int pass_bvh_traversed_instances;
+	int pass_ray_bounces;
+	int pass_pad3;
+#endif
 } KernelFilm;
 
 typedef struct KernelBackground {
@@ -876,6 +898,11 @@ typedef struct KernelIntegrator {
 	float inv_pdf_lights;
 	int pdf_background_res;
 
+	/* light portals */
+	float portal_pdf;
+	int num_portals;
+	int portal_offset;
+
 	/* bounces */
 	int min_bounce;
 	int max_bounce;
@@ -928,6 +955,8 @@ typedef struct KernelIntegrator {
 	int volume_max_steps;
 	float volume_step_size;
 	int volume_samples;
+
+	int pad;
 } KernelIntegrator;
 
 typedef struct KernelBVH {
@@ -937,8 +966,8 @@ typedef struct KernelBVH {
 	int have_motion;
 	int have_curves;
 	int have_instancing;
-
-	int pad1, pad2, pad3;
+	int use_qbvh;
+	int pad1, pad2;
 } KernelBVH;
 
 typedef enum CurveFlag {
@@ -961,9 +990,8 @@ typedef struct KernelCurves {
 } KernelCurves;
 
 typedef struct KernelTables {
-	int blackbody_offset;
 	int beckmann_offset;
-	int pad1, pad2;
+	int pad1, pad2, pad3;
 } KernelTables;
 
 typedef struct KernelData {
@@ -983,6 +1011,68 @@ typedef struct CameraData {
 	int shader;
 } CameraData;
 
+#ifdef __KERNEL_DEBUG__
+typedef ccl_addr_space struct DebugData {
+	// Total number of BVH node traversal steps and primitives intersections
+	// for the camera rays.
+	int num_bvh_traversal_steps;
+	int num_bvh_traversed_instances;
+	int num_ray_bounces;
+} DebugData;
+#endif
+
+/* Declarations required for split kernel */
+
+/* Macro for queues */
+/* Value marking queue's empty slot */
+#define QUEUE_EMPTY_SLOT -1
+
+/*
+* Queue 1 - Active rays
+* Queue 2 - Background queue
+* Queue 3 - Shadow ray cast kernel - AO
+* Queue 4 - Shadow ray cast kernel - direct lighting
+*/
+#define NUM_QUEUES 4
+
+/* Queue names */
+enum QueueNumber {
+	QUEUE_ACTIVE_AND_REGENERATED_RAYS = 0,     /* All active rays and regenerated rays are enqueued here. */
+	QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS = 1,  /* All
+	                                            * 1. Background-hit rays,
+	                                            * 2. Rays that have exited path-iteration but need to update output buffer
+	                                            * 3. Rays to be regenerated
+	                                            * are enqueued here.
+	                                            */
+	QUEUE_SHADOW_RAY_CAST_AO_RAYS = 2,         /* All rays for which a shadow ray should be cast to determine radiance
+	                                            * contribution for AO are enqueued here.
+	                                            */
+	QUEUE_SHADOW_RAY_CAST_DL_RAYS = 3,         /* All rays for which a shadow ray should be cast to determine radiance
+	                                            * contribution for direct lighting are enqueued here.
+	                                            */
+};
+
+/* We use RAY_STATE_MASK to get ray_state (enums 0 to 6) */
+#define RAY_STATE_MASK 0x007
+#define RAY_FLAG_MASK 0x0F8
+enum RayState {
+	RAY_ACTIVE = 0,             // Denotes ray is actively involved in path-iteration
+	RAY_INACTIVE = 1,           // Denotes ray has completed processing all samples and is inactive
+	RAY_UPDATE_BUFFER = 2,      // Denotes ray has exited path-iteration and needs to update output buffer
+	RAY_HIT_BACKGROUND = 3,     // Denotes ray has hit background
+	RAY_TO_REGENERATE = 4,      // Denotes ray has to be regenerated
+	RAY_REGENERATED = 5,        // Denotes ray has been regenerated
+	RAY_SKIP_DL = 6,            // Denotes ray should skip direct lighting
+	RAY_SHADOW_RAY_CAST_AO = 16, // Flag: ray has to execute shadow blocked function in AO part
+	RAY_SHADOW_RAY_CAST_DL = 32 // Flag: ray has to execute shadow blocked function in direct lighting part
+};
+/* Ray-state accessors. Arguments are fully parenthesized, so compound expressions (e.g. A | B) are safe. */
+#define ASSIGN_RAY_STATE(ray_state, ray_index, state) ((ray_state)[(ray_index)] = (((ray_state)[(ray_index)] & RAY_FLAG_MASK) | (state)))
+#define IS_STATE(ray_state, ray_index, state) (((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state))
+#define ADD_RAY_FLAG(ray_state, ray_index, flag) ((ray_state)[(ray_index)] = ((ray_state)[(ray_index)] | (flag)))
+#define REMOVE_RAY_FLAG(ray_state, ray_index, flag) ((ray_state)[(ray_index)] = ((ray_state)[(ray_index)] & (~(flag))))
+#define IS_FLAG(ray_state, ray_index, flag) ((ray_state)[(ray_index)] & (flag))
+
 CCL_NAMESPACE_END
 
 #endif /*  __KERNEL_TYPES_H__ */
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index 1273869ca28..0a74a9deba9 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -374,7 +374,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 			/* distance sampling */
 			sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf);
 
-			/* modifiy pdf for hit/miss decision */
+			/* modify pdf for hit/miss decision */
 			if(probalistic_scatter)
 				pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(sigma_t, t);
 
@@ -422,7 +422,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 
 /* heterogeneous volume distance sampling: integrate stepping through the
  * volume until we reach the end, get absorbed entirely, or run out of
- * iterations. this does probalistically scatter or get transmitted through
+ * iterations. this does probabilistically scatter or get transmitted through
  * for path tracing where we don't want to branch. */
 ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
 	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, RNG *rng)
@@ -578,10 +578,11 @@ ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals
 /* Decoupled Volume Sampling
  *
  * VolumeSegment is list of coefficients and transmittance stored at all steps
- * through a volume. This can then latter be used for decoupled sampling as in:
+ * through a volume. This can then later be used for decoupled sampling as in:
  * "Importance Sampling Techniques for Path Tracing in Participating Media"
  *
- * On the GPU this is only supported for homogeneous volumes (1 step), due to
+ * On the GPU this is only supported (but currently not enabled)
+ * for homogeneous volumes (1 step), due to
  * no support for malloc/free and too much stack usage with a fix size array. */
 
 typedef struct VolumeStep {
@@ -595,6 +596,7 @@ typedef struct VolumeStep {
 } VolumeStep;
 
 typedef struct VolumeSegment {
+	VolumeStep stack_step;      /* stack storage for homogeneous step, to avoid malloc */
 	VolumeStep *steps;			/* recorded steps */
 	int numsteps;				/* number of steps */
 	int closure_flag;			/* accumulated closure flags from all steps */
@@ -608,7 +610,7 @@ typedef struct VolumeSegment {
 /* record volume steps to the end of the volume.
  *
  * it would be nice if we could only record up to the point that we need to scatter,
- * but the entire segment is needed to do always scattering, rather than probalistically
+ * but the entire segment is needed to do always scattering, rather than probabilistically
  * hitting or missing the volume. if we don't know the transmittance at the end of the
  * volume we can't generate stratified distance samples up to that transmittance */
 ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state,
@@ -621,17 +623,22 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 	float step_size, random_jitter_offset;
 
 	if(heterogeneous) {
-		max_steps = kernel_data.integrator.volume_max_steps;
+		const int global_max_steps = kernel_data.integrator.volume_max_steps;
 		step_size = kernel_data.integrator.volume_step_size;
-		random_jitter_offset = lcg_step_float(&state->rng_congruential) * step_size;
-
 		/* compute exact steps in advance for malloc */
 		max_steps = max((int)ceilf(ray->t/step_size), 1);
+		if(max_steps > global_max_steps) {
+			max_steps = global_max_steps;
+			step_size = ray->t / (float)max_steps;
+		}
+		segment->steps = (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps);
+		random_jitter_offset = lcg_step_float(&state->rng_congruential) * step_size;
 	}
 	else {
 		max_steps = 1;
 		step_size = ray->t;
 		random_jitter_offset = 0.0f;
+		segment->steps = &segment->stack_step;
 	}
 	
 	/* init accumulation variables */
@@ -640,10 +647,9 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 	float3 cdf_distance = make_float3(0.0f, 0.0f, 0.0f);
 	float t = 0.0f;
 
-	segment->closure_flag = 0;
 	segment->numsteps = 0;
-
-	segment->steps = (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps);
+	segment->closure_flag = 0;
+	bool is_last_step_empty = false;
 
 	VolumeStep *step = segment->steps;
 
@@ -685,12 +691,24 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 			step->closure_flag = closure_flag;
 
 			segment->closure_flag |= closure_flag;
+
+			is_last_step_empty = false;
+			segment->numsteps++;
 		}
 		else {
-			/* store empty step (todo: skip consecutive empty steps) */
-			step->sigma_t = make_float3(0.0f, 0.0f, 0.0f);
-			step->sigma_s = make_float3(0.0f, 0.0f, 0.0f);
-			step->closure_flag = 0;
+			if(is_last_step_empty) {
+				/* consecutive empty step, merge */
+				step--;
+			}
+			else {
+				/* store empty step */
+				step->sigma_t = make_float3(0.0f, 0.0f, 0.0f);
+				step->sigma_s = make_float3(0.0f, 0.0f, 0.0f);
+				step->closure_flag = 0;
+
+				segment->numsteps++;
+				is_last_step_empty = true;
+			}
 		}
 
 		step->accum_transmittance = accum_transmittance;
@@ -698,8 +716,6 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 		step->t = new_t;
 		step->shade_t = t + random_jitter_offset;
 
-		segment->numsteps++;
-
 		/* stop if at the end of the volume */
 		t = new_t;
 		if(t == ray->t)
@@ -729,16 +745,13 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 
 ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *segment)
 {
-	free(segment->steps);
+	if(segment->steps != &segment->stack_step)
+		free(segment->steps);
 }
 
 /* scattering for homogeneous and heterogeneous volumes, using decoupled ray
- * marching. unlike the non-decoupled functions, these do not do probalistic
- * scattering, they always scatter if there is any non-zero scattering
- * coefficient.
+ * marching. this function does not do emission or modify throughput.
  *
- * these also do not do emission or modify throughput. 
- * 
  * function is expected to return VOLUME_PATH_SCATTERED when probalistic_scatter is false */
 ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 	KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd,
@@ -753,7 +766,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 	sd->randb_closure = rphase*3.0f - channel;
 	float xi = rscatter;
 
-	/* probalistic scattering decision based on transmittance */
+	/* probabilistic scattering decision based on transmittance */
 	if(probalistic_scatter) {
 		float sample_transmittance = kernel_volume_channel_get(segment->accum_transmittance, channel);
 
@@ -833,7 +846,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 		float3 distance_pdf;
 		sample_t = prev_t + kernel_volume_distance_sample(step_t, step->sigma_t, channel, xi, &transmittance, &distance_pdf);
 
-		/* modifiy pdf for hit/miss decision */
+		/* modify pdf for hit/miss decision */
 		if(probalistic_scatter)
 			distance_pdf *= make_float3(1.0f, 1.0f, 1.0f) - segment->accum_transmittance;
 
@@ -929,7 +942,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 /* decide if we need to use decoupled or not */
 ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneous, bool direct, int sampling_method)
 {
-	/* decoupled ray marching for heterogenous volumes not supported on the GPU,
+	/* decoupled ray marching for heterogeneous volumes not supported on the GPU,
 	 * which also means equiangular and multiple importance sampling is not
 	 * support for that case */
 #ifdef __KERNEL_GPU__
@@ -958,7 +971,7 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
                                          Ray *ray,
                                          VolumeStack *stack)
 {
-	/* NULL ray happens in the baker, does it need proper initializetion of
+	/* NULL ray happens in the baker, does it need proper initialization of
 	 * camera in volume?
 	 */
 	if(!kernel_data.cam.is_inside_volume || ray == NULL) {
@@ -976,25 +989,26 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
 		return;
 	}
 
-	const float3 Pend = ray->P + ray->D*ray->t;
 	Ray volume_ray = *ray;
-	int stack_index = 0, enclosed_index = 0;
-	int enclosed_volumes[VOLUME_STACK_SIZE];
+	volume_ray.t = FLT_MAX;
 
-	while(stack_index < VOLUME_STACK_SIZE - 1 &&
-	      enclosed_index < VOLUME_STACK_SIZE - 1)
-	{
-		Intersection isect;
-		bool hit = scene_intersect(kg, &volume_ray, PATH_RAY_ALL_VISIBILITY,
-		                           &isect,
-		                           NULL, 0.0f, 0.0f);
-		if(!hit) {
-			break;
-		}
+	int stack_index = 0, enclosed_index = 0;
 
-		ShaderData sd;
-		shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0);
-		if(sd.flag & SD_HAS_VOLUME) {
+#ifdef __VOLUME_RECORD_ALL__
+	Intersection hits[2*VOLUME_STACK_SIZE];
+	uint num_hits = scene_intersect_volume_all(kg,
+	                                           &volume_ray,
+	                                           hits,
+	                                           2*VOLUME_STACK_SIZE);
+	if(num_hits > 0) {
+		int enclosed_volumes[VOLUME_STACK_SIZE];
+		Intersection *isect = hits;
+
+		qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
+
+		for(uint hit = 0; hit < num_hits; ++hit, ++isect) {
+			ShaderData sd;
+			shader_setup_from_ray(kg, &sd, isect, &volume_ray, 0, 0);
 			if(sd.flag & SD_BACKFACING) {
 				/* If ray exited the volume and never entered to that volume
 				 * it means that camera is inside such a volume.
@@ -1014,24 +1028,56 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
 			}
 			else {
 				/* If ray from camera enters the volume, this volume shouldn't
-				 * be added to the stak on exit.
+				 * be added to the stack on exit.
 				 */
 				enclosed_volumes[enclosed_index++] = sd.object;
 			}
 		}
+	}
+#else
+	int enclosed_volumes[VOLUME_STACK_SIZE];
+	int step = 0;
 
-		/* Move ray forward. */
-		volume_ray.P = ray_offset(sd.P, -sd.Ng);
-		if(volume_ray.t != FLT_MAX) {
-			volume_ray.D = normalize_len(Pend - volume_ray.P, &volume_ray.t);
-			/* TODO(sergey): Find a faster way detecting that ray_offset moved
-			 * us pass through the end point.
+	while(stack_index < VOLUME_STACK_SIZE - 1 &&
+	      enclosed_index < VOLUME_STACK_SIZE - 1 &&
+	      step < 2 * VOLUME_STACK_SIZE)
+	{
+		Intersection isect;
+		if(!scene_intersect_volume(kg, &volume_ray, &isect)) {
+			break;
+		}
+
+		ShaderData sd;
+		shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0);
+		if(sd.flag & SD_BACKFACING) {
+			/* If ray exited the volume and never entered to that volume
+			 * it means that camera is inside such a volume.
 			 */
-			if(dot(ray->D, volume_ray.D) < 0.0f) {
-				break;
+			bool is_enclosed = false;
+			for(int i = 0; i < enclosed_index; ++i) {
+				if(enclosed_volumes[i] == sd.object) {
+					is_enclosed = true;
+					break;
+				}
 			}
+			if(is_enclosed == false) {
+				stack[stack_index].object = sd.object;
+				stack[stack_index].shader = sd.shader;
+				++stack_index;
+			}
+		}
+		else {
+			/* If ray from camera enters the volume, this volume shouldn't
+			 * be added to the stack on exit.
+			 */
+			enclosed_volumes[enclosed_index++] = sd.object;
 		}
+
+		/* Move ray forward. */
+		volume_ray.P = ray_offset(sd.P, -sd.Ng);
+		++step;
 	}
+#endif
 	/* stack_index of 0 means quick checks outside of the kernel gave false
 	 * positive, nothing to worry about, just we've wasted quite a few of
 	 * ticks just to come into conclusion that camera is in the air.
@@ -1094,4 +1140,49 @@ ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd
 	}
 }
 
+#ifdef __SUBSURFACE__
+ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg,
+                                                          Ray *ray,
+                                                          VolumeStack *stack)
+{
+	kernel_assert(kernel_data.integrator.use_volumes);
+
+	Ray volume_ray = *ray;
+
+#ifdef __VOLUME_RECORD_ALL__
+	Intersection hits[2*VOLUME_STACK_SIZE];
+	uint num_hits = scene_intersect_volume_all(kg,
+	                                           &volume_ray,
+	                                           hits,
+	                                           2*VOLUME_STACK_SIZE);
+	if(num_hits > 0) {
+		Intersection *isect = hits;
+
+		qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
+
+		for(uint hit = 0; hit < num_hits; ++hit, ++isect) {
+			ShaderData sd;
+			shader_setup_from_ray(kg, &sd, isect, &volume_ray, 0, 0);
+			kernel_volume_stack_enter_exit(kg, &sd, stack);
+		}
+	}
+#else
+	Intersection isect;
+	int step = 0;
+	while(step < 2 * VOLUME_STACK_SIZE &&
+	      scene_intersect_volume(kg, &volume_ray, &isect))
+	{
+		ShaderData sd;
+		shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0);
+		kernel_volume_stack_enter_exit(kg, &sd, stack);
+
+		/* Move ray forward. */
+		volume_ray.P = ray_offset(sd.P, -sd.Ng);
+		volume_ray.t -= sd.ray_length;
+		++step;
+	}
+#endif
+}
+#endif
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
new file mode 100644
index 00000000000..9b83d972e97
--- /dev/null
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_WORK_STEALING_H__
+#define __KERNEL_WORK_STEALING_H__
+
+/*
+ * Utility functions for work stealing
+ */
+
+#ifdef __WORK_STEALING__
+
+#ifdef __KERNEL_OPENCL__
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#endif
+
+uint get_group_id_with_ray_index(uint ray_index,
+                                 uint tile_dim_x,
+                                 uint tile_dim_y,
+                                 uint parallel_samples,
+                                 int dim)
+{
+	/* A row of rays spans tile_dim_x * parallel_samples entries. */
+	const uint row_width = tile_dim_x * parallel_samples;
+
+	if(dim != 0) {
+		/* Only dimensions 0 and 1 are supported. */
+		kernel_assert(dim == 1);
+		return (ray_index / row_width) / get_local_size(1);
+	}
+	return (ray_index % row_width) / get_local_size(0);
+}
+
+uint get_total_work(uint tile_dim_x,
+                    uint tile_dim_y,
+                    uint grp_idx,
+                    uint grp_idy,
+                    uint num_samples)
+{
+	/* Number of work-items of this group that lie within the tile border,
+	 * per dimension. Only the last group of a dimension takes the
+	 * remainder; a zero remainder falls back to the full local size.
+	 */
+	uint border_x = get_local_size(0);
+	uint border_y = get_local_size(1);
+
+	if(grp_idx == (get_num_groups(0) - 1) && (tile_dim_x % border_x) != 0) {
+		border_x = tile_dim_x % border_x;
+	}
+	if(grp_idy == (get_num_groups(1) - 1) && (tile_dim_y % border_y) != 0) {
+		border_y = tile_dim_y % border_y;
+	}
+
+	/* Total work is the work-item count above times the sample count. */
+	return border_x * border_y *
+	       num_samples;
+}
+
+/* Fetch the next work index of the work-group that ray_index maps to.
+ * Returns 1 when *my_work is valid, 0 when no next work is available. */
+int get_next_work(ccl_global uint *work_pool,
+                  ccl_private uint *my_work,
+                  uint tile_dim_x,
+                  uint tile_dim_y,
+                  uint num_samples,
+                  uint parallel_samples,
+                  uint ray_index)
+{
+	/* Work-group coordinates derived from the ray index. */
+	const uint group_x = get_group_id_with_ray_index(
+		ray_index, tile_dim_x, tile_dim_y, parallel_samples, 0);
+	const uint group_y = get_group_id_with_ray_index(
+		ray_index, tile_dim_x, tile_dim_y, parallel_samples, 1);
+
+	/* Amount of work available to this group. */
+	const uint work_limit = get_total_work(
+		tile_dim_x, tile_dim_y, group_x, group_y, num_samples);
+
+	/* Each work-group has its own counter in work_pool; atomic_inc returns
+	 * the value before the increment, which becomes this caller's work
+	 * index.
+	 */
+	const uint pool_slot = group_y * get_num_groups(0) + group_x;
+	*my_work = atomic_inc(&work_pool[pool_slot]);
+
+	return (*my_work < work_limit) ? 1 : 0;
+}
+
+/* Decode sample number w.r.t. assigned my_work.
+ * This function assumes that the passed my_work is valid. */
+uint get_my_sample(uint my_work,
+                   uint tile_dim_x,
+                   uint tile_dim_y,
+                   uint parallel_samples,
+                   uint ray_index)
+{
+	/* Work-group coordinates derived from the ray index. */
+	const uint group_x = get_group_id_with_ray_index(
+		ray_index, tile_dim_x, tile_dim_y, parallel_samples, 0);
+	const uint group_y = get_group_id_with_ray_index(
+		ray_index, tile_dim_x, tile_dim_y, parallel_samples, 1);
+
+	/* Work-items of the group within the tile border, per dimension. Only
+	 * the last group of a dimension takes the remainder; a zero remainder
+	 * falls back to the full local size.
+	 */
+	uint border_x = get_local_size(0);
+	uint border_y = get_local_size(1);
+
+	if(group_x == (get_num_groups(0) - 1) && (tile_dim_x % border_x) != 0) {
+		border_x = tile_dim_x % border_x;
+	}
+	if(group_y == (get_num_groups(1) - 1) && (tile_dim_y % border_y) != 0) {
+		border_y = tile_dim_y % border_y;
+	}
+
+	/* The quotient of my_work by the group's work-item count within the
+	 * tile border is the sample number.
+	 */
+	const uint pixels_in_group = border_x * border_y;
+
+	return my_work / pixels_in_group;
+}
+
+/* Decode pixel and tile position w.r.t. assigned my_work.
+ * Results are written to pixel_x, pixel_y, tile_x and tile_y. */
+void get_pixel_tile_position(ccl_private uint *pixel_x,
+                             ccl_private uint *pixel_y,
+                             ccl_private uint *tile_x,
+                             ccl_private uint *tile_y,
+                             uint my_work,
+                             uint tile_dim_x,
+                             uint tile_dim_y,
+                             uint tile_offset_x,
+                             uint tile_offset_y,
+                             uint parallel_samples,
+                             uint ray_index)
+{
+	/* Work-group coordinates derived from the ray index. */
+	const uint group_x = get_group_id_with_ray_index(
+		ray_index, tile_dim_x, tile_dim_y, parallel_samples, 0);
+	const uint group_y = get_group_id_with_ray_index(
+		ray_index, tile_dim_x, tile_dim_y, parallel_samples, 1);
+
+	const uint local_w = get_local_size(0);
+	const uint local_h = get_local_size(1);
+
+	/* Work-items of the group within the tile border, per dimension. Only
+	 * the last group of a dimension takes the remainder; a zero remainder
+	 * falls back to the full local size.
+	 */
+	uint border_x = local_w;
+	uint border_y = local_h;
+
+	if(group_x == (get_num_groups(0) - 1) && (tile_dim_x % local_w) != 0) {
+		border_x = tile_dim_x % local_w;
+	}
+	if(group_y == (get_num_groups(1) - 1) && (tile_dim_y % local_h) != 0) {
+		border_y = tile_dim_y % local_h;
+	}
+
+	/* Position of the pixel inside its work-group: the index wraps over
+	 * the group's pixel count and is split into a column and a row.
+	 */
+	const uint pixel_in_group = my_work % (border_x * border_y);
+	const uint group_pixel_x = pixel_in_group % border_x;
+	const uint group_pixel_y = pixel_in_group / border_x;
+
+	/* Absolute pixel = tile offset + group origin + position in group. */
+	*pixel_x = tile_offset_x + (group_x * local_w) + group_pixel_x;
+	*pixel_y = tile_offset_y + (group_y * local_h) + group_pixel_y;
+
+	/* Tile-relative position. */
+	*tile_x = *pixel_x - tile_offset_x;
+	*tile_y = *pixel_y - tile_offset_y;
+}
+
+#endif  /* __WORK_STEALING__ */
+
+#endif  /* __KERNEL_WORK_STEALING_H__ */
diff --git a/intern/cycles/kernel/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp
index fa2113fbb46..2c8d3503c1a 100644
--- a/intern/cycles/kernel/kernel.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp
@@ -11,18 +11,19 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* CPU kernel entry points */
 
-#include "kernel.h"
 #include "kernel_compat_cpu.h"
+#include "kernel.h"
 #include "kernel_math.h"
 #include "kernel_types.h"
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
+#include "kernel_path_branched.h"
 #include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
@@ -37,7 +38,14 @@ void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t s
 		assert(0);
 }
 
-void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t width, size_t height, size_t depth, InterpolationType interpolation)
+void kernel_tex_copy(KernelGlobals *kg,
+                     const char *name,
+                     device_ptr mem,
+                     size_t width,
+                     size_t height,
+                     size_t depth,
+                     InterpolationType interpolation,
+                     ExtensionType extension)
 {
 	if(0) {
 	}
@@ -55,7 +63,7 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
 		int id = atoi(name + strlen("__tex_image_float_"));
 		int array_index = id;
 
-		if (array_index >= 0 && array_index < MAX_FLOAT_IMAGES) {
+		if(array_index >= 0 && array_index < MAX_FLOAT_IMAGES) {
 			tex = &kg->texture_float_images[array_index];
 		}
 
@@ -63,6 +71,7 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
 			tex->data = (float4*)mem;
 			tex->dimensions_set(width, height, depth);
 			tex->interpolation = interpolation;
+			tex->extension = extension;
 		}
 	}
 	else if(strstr(name, "__tex_image")) {
@@ -70,7 +79,7 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
 		int id = atoi(name + strlen("__tex_image_"));
 		int array_index = id - MAX_FLOAT_IMAGES;
 
-		if (array_index >= 0 && array_index < MAX_BYTE_IMAGES) {
+		if(array_index >= 0 && array_index < MAX_BYTE_IMAGES) {
 			tex = &kg->texture_byte_images[array_index];
 		}
 
@@ -78,6 +87,7 @@ void kernel_tex_copy(KernelGlobals *kg, const char *name, device_ptr mem, size_t
 			tex->data = (uchar4*)mem;
 			tex->dimensions_set(width, height, depth);
 			tex->interpolation = interpolation;
+			tex->extension = extension;
 		}
 	}
 	else
diff --git a/intern/cycles/kernel/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
index e7ff21a6f09..df77bedc729 100644
--- a/intern/cycles/kernel/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* Optimized CPU kernel entry points. This file is compiled with AVX
@@ -31,13 +31,14 @@
  
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
 
-#include "kernel.h"
 #include "kernel_compat_cpu.h"
+#include "kernel.h"
 #include "kernel_math.h"
 #include "kernel_types.h"
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
+#include "kernel_path_branched.h"
 #include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
index cb1662bbfbe..b3192369794 100644
--- a/intern/cycles/kernel/kernel_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* Optimized CPU kernel entry points. This file is compiled with AVX2
@@ -32,13 +32,14 @@
  
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
 
-#include "kernel.h"
 #include "kernel_compat_cpu.h"
+#include "kernel.h"
 #include "kernel_math.h"
 #include "kernel_types.h"
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
+#include "kernel_path_branched.h"
 #include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
index 740998e8c92..f9c5134e442 100644
--- a/intern/cycles/kernel/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* Optimized CPU kernel entry points. This file is compiled with SSE2
@@ -27,13 +27,14 @@
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 
-#include "kernel.h"
 #include "kernel_compat_cpu.h"
+#include "kernel.h"
 #include "kernel_math.h"
 #include "kernel_types.h"
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
+#include "kernel_path_branched.h"
 #include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
index da73a3a1c97..2dbe4b81821 100644
--- a/intern/cycles/kernel/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
@@ -29,13 +29,14 @@
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 
-#include "kernel.h"
 #include "kernel_compat_cpu.h"
+#include "kernel.h"
 #include "kernel_math.h"
 #include "kernel_types.h"
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
+#include "kernel_path_branched.h"
 #include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
index 5704f60e138..5c57ad01181 100644
--- a/intern/cycles/kernel/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
@@ -30,13 +30,14 @@
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
 
-#include "kernel.h"
 #include "kernel_compat_cpu.h"
+#include "kernel.h"
 #include "kernel_math.h"
 #include "kernel_types.h"
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
+#include "kernel_path_branched.h"
 #include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 9ed4592f604..bcd55b8c676 100644
--- a/intern/cycles/kernel/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -11,18 +11,19 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* CUDA kernel entry points */
 
-#include "kernel_compat_cuda.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_film.h"
-#include "kernel_path.h"
-#include "kernel_bake.h"
+#include "../../kernel_compat_cuda.h"
+#include "../../kernel_math.h"
+#include "../../kernel_types.h"
+#include "../../kernel_globals.h"
+#include "../../kernel_film.h"
+#include "../../kernel_path.h"
+#include "../../kernel_path_branched.h"
+#include "../../kernel_bake.h"
 
 /* device data taken from CUDA occupancy calculator */
 
@@ -52,6 +53,18 @@
 #define CUDA_KERNEL_MAX_REGISTERS 63
 #define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
 
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+#define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#define CUDA_BLOCK_MAX_THREADS 1024
+#define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#define CUDA_THREADS_BLOCK_WIDTH 16
+#define CUDA_KERNEL_MAX_REGISTERS 63
+#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
 /* 5.0 and 5.2 */
 #elif __CUDA_ARCH__ == 500 || __CUDA_ARCH__ == 520
 #define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
diff --git a/intern/cycles/kernel/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index 4f20ef9ca15..57db6fd9098 100644
--- a/intern/cycles/kernel/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -11,19 +11,22 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 /* OpenCL kernel entry points - unfinished */
 
-#include "kernel_compat_opencl.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
+#include "../../kernel_compat_opencl.h"
+#include "../../kernel_math.h"
+#include "../../kernel_types.h"
+#include "../../kernel_globals.h"
 
-#include "kernel_film.h"
-#include "kernel_path.h"
-#include "kernel_bake.h"
+#include "../../kernel_film.h"
+#include "../../kernel_path.h"
+#include "../../kernel_path_branched.h"
+#include "../../kernel_bake.h"
+
+#ifdef __COMPILE_ONLY_MEGAKERNEL__
 
 __kernel void kernel_ocl_path_trace(
 	ccl_constant KernelData *data,
@@ -32,7 +35,7 @@ __kernel void kernel_ocl_path_trace(
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
 	int sample,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -43,7 +46,7 @@ __kernel void kernel_ocl_path_trace(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
 	int x = sx + get_global_id(0);
 	int y = sy + get_global_id(1);
@@ -52,17 +55,18 @@ __kernel void kernel_ocl_path_trace(
 		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
 }
 
-__kernel void kernel_ocl_convert_to_byte(
+#else // __COMPILE_ONLY_MEGAKERNEL__
+
+__kernel void kernel_ocl_shader(
 	ccl_constant KernelData *data,
-	ccl_global uchar4 *rgba,
-	ccl_global float *buffer,
+	ccl_global uint4 *input,
+	ccl_global float4 *output,
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
-	float sample_scale,
-	int sx, int sy, int sw, int sh, int offset, int stride)
+	int type, int sx, int sw, int offset, int sample)
 {
 	KernelGlobals kglobals, *kg = &kglobals;
 
@@ -70,26 +74,24 @@ __kernel void kernel_ocl_convert_to_byte(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
 	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
 
-	if(x < sx + sw && y < sy + sh)
-		kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+	if(x < sx + sw)
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
 }
 
-__kernel void kernel_ocl_convert_to_half_float(
+__kernel void kernel_ocl_bake(
 	ccl_constant KernelData *data,
-	ccl_global uchar4 *rgba,
-	ccl_global float *buffer,
+	ccl_global uint4 *input,
+	ccl_global float4 *output,
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
-	float sample_scale,
-	int sx, int sy, int sw, int sh, int offset, int stride)
+	int type, int sx, int sw, int offset, int sample)
 {
 	KernelGlobals kglobals, *kg = &kglobals;
 
@@ -97,25 +99,30 @@ __kernel void kernel_ocl_convert_to_half_float(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
 	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
 
-	if(x < sx + sw && y < sy + sh)
-		kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+	if(x < sx + sw) {
+#ifdef __NO_BAKING__
+		output[x] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+#else
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample);
+#endif
+	}
 }
 
-__kernel void kernel_ocl_shader(
+__kernel void kernel_ocl_convert_to_byte(
 	ccl_constant KernelData *data,
-	ccl_global uint4 *input,
-	ccl_global float4 *output,
+	ccl_global uchar4 *rgba,
+	ccl_global float *buffer,
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
-	int type, int sx, int sw, int offset, int sample)
+	float sample_scale,
+	int sx, int sy, int sw, int sh, int offset, int stride)
 {
 	KernelGlobals kglobals, *kg = &kglobals;
 
@@ -123,24 +130,26 @@ __kernel void kernel_ocl_shader(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
 	int x = sx + get_global_id(0);
+	int y = sy + get_global_id(1);
 
-	if(x < sx + sw)
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
+	if(x < sx + sw && y < sy + sh)
+		kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
-__kernel void kernel_ocl_bake(
+__kernel void kernel_ocl_convert_to_half_float(
 	ccl_constant KernelData *data,
-	ccl_global uint4 *input,
-	ccl_global float4 *output,
+	ccl_global uchar4 *rgba,
+	ccl_global float *buffer,
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
-	int type, int sx, int sw, int offset, int sample)
+	float sample_scale,
+	int sx, int sy, int sw, int sh, int offset, int stride)
 {
 	KernelGlobals kglobals, *kg = &kglobals;
 
@@ -148,11 +157,13 @@ __kernel void kernel_ocl_bake(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "kernel_textures.h"
+#include "../../kernel_textures.h"
 
 	int x = sx + get_global_id(0);
+	int y = sy + get_global_id(1);
 
-	if(x < sx + sw)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample);
+	if(x < sx + sw && y < sy + sh)
+		kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
+#endif // __COMPILE_ONLY_MEGAKERNEL__
+\ No newline at end of file
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
new file mode 100644
index 00000000000..eff77b89a0a
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
@@ -0,0 +1,128 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_background_buffer_update.h"
+/* OpenCL entry point wrapping kernel_background_buffer_update() with the queue fetch and re-enqueue steps. */
+__kernel void kernel_ocl_path_trace_background_buffer_update(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_state,
+        ccl_global uint *rng_coop,             /* Required for buffer update */
+        ccl_global float3 *throughput_coop,    /* Required for background hit processing */
+        PathRadiance *PathRadiance_coop,       /* Required for background hit processing and buffer update */
+        ccl_global Ray *Ray_coop,              /* Required for background hit processing */
+        ccl_global PathState *PathState_coop,  /* Required for background hit processing */
+        ccl_global float *L_transparent_coop,  /* Required for background hit processing and buffer update */
+        ccl_global char *ray_state,            /* Stores information on the current state of a ray */
+        int sw, int sh, int sx, int sy, int stride,
+        int rng_state_offset_x,
+        int rng_state_offset_y,
+        int rng_state_stride,
+        ccl_global unsigned int *work_array,   /* Denotes work of each ray */
+        ccl_global int *Queue_data,            /* Queues memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize,                         /* Size (capacity) of each queue */
+        int end_sample,
+        int start_sample,
+#ifdef __WORK_STEALING__
+        ccl_global unsigned int *work_pool_wgs,
+        unsigned int num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples)                  /* Number of samples to be processed in parallel */
+{
+	ccl_local unsigned int local_queue_atomics;  /* Per-work-group counter handed to enqueue_ray_index_local() below. */
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {  /* Only the first work-item of the group resets it. */
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);  /* Every work-item waits here until the reset is done. */
+
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);  /* Linearised 2D global id. */
+	if(ray_index == 0) {
+		/* We will empty this queue in this kernel. */
+		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+	}
+	char enqueue_flag = 0;  /* Stays 0 unless kernel_background_buffer_update() below returns otherwise. */
+	ray_index = get_ray_index(ray_index,
+	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                          Queue_data,
+	                          queuesize,
+	                          1);  /* ray_index now holds the fetched queue entry, possibly QUEUE_EMPTY_SLOT. */
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since there are barrier() calls later in the kernel (presumably in
+	 * enqueue_ray_index_local()): CPU devices expect all threads to reach them.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {  /* Non-GPU build: skip the work but keep the thread running. */
+#endif
+		enqueue_flag =
+			kernel_background_buffer_update(globals,
+			                                data,
+			                                shader_data,
+			                                per_sample_output_buffers,
+			                                rng_state,
+			                                rng_coop,
+			                                throughput_coop,
+			                                PathRadiance_coop,
+			                                Ray_coop,
+			                                PathState_coop,
+			                                L_transparent_coop,
+			                                ray_state,
+			                                sw, sh, sx, sy, stride,
+			                                rng_state_offset_x,
+			                                rng_state_offset_y,
+			                                rng_state_stride,
+			                                work_array,
+			                                end_sample,
+			                                start_sample,
+#ifdef __WORK_STEALING__
+			                                work_pool_wgs,
+			                                num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+			                                debugdata_coop,
+#endif
+			                                parallel_samples,
+			                                ray_index);
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	 * these rays will be made active during the next SceneIntersect kernel.
+	 */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
new file mode 100644
index 00000000000..c3277676029
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
@@ -0,0 +1,241 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_data_init.h"
+/* OpenCL entry point: forwards every argument, unchanged and in declaration order, to kernel_data_init(). */
+__kernel void kernel_ocl_path_trace_data_init(
+        ccl_global char *globals,
+        ccl_global char *shader_data_sd,                  /* Arguments related to ShaderData */
+        ccl_global char *shader_data_sd_DL_shadow,        /* Arguments related to ShaderData */
+
+        ccl_global float3 *P_sd,
+        ccl_global float3 *P_sd_DL_shadow,
+
+        ccl_global float3 *N_sd,
+        ccl_global float3 *N_sd_DL_shadow,
+
+        ccl_global float3 *Ng_sd,
+        ccl_global float3 *Ng_sd_DL_shadow,
+
+        ccl_global float3 *I_sd,
+        ccl_global float3 *I_sd_DL_shadow,
+
+        ccl_global int *shader_sd,
+        ccl_global int *shader_sd_DL_shadow,
+
+        ccl_global int *flag_sd,
+        ccl_global int *flag_sd_DL_shadow,
+
+        ccl_global int *prim_sd,
+        ccl_global int *prim_sd_DL_shadow,
+
+        ccl_global int *type_sd,
+        ccl_global int *type_sd_DL_shadow,
+
+        ccl_global float *u_sd,
+        ccl_global float *u_sd_DL_shadow,
+
+        ccl_global float *v_sd,
+        ccl_global float *v_sd_DL_shadow,
+
+        ccl_global int *object_sd,
+        ccl_global int *object_sd_DL_shadow,
+
+        ccl_global float *time_sd,
+        ccl_global float *time_sd_DL_shadow,
+
+        ccl_global float *ray_length_sd,
+        ccl_global float *ray_length_sd_DL_shadow,
+
+        ccl_global int *ray_depth_sd,
+        ccl_global int *ray_depth_sd_DL_shadow,
+
+        ccl_global int *transparent_depth_sd,
+        ccl_global int *transparent_depth_sd_DL_shadow,
+
+        /* Ray differentials. */
+        ccl_global differential3 *dP_sd,
+        ccl_global differential3 *dP_sd_DL_shadow,
+
+        ccl_global differential3 *dI_sd,
+        ccl_global differential3 *dI_sd_DL_shadow,
+
+        ccl_global differential *du_sd,
+        ccl_global differential *du_sd_DL_shadow,
+
+        ccl_global differential *dv_sd,
+        ccl_global differential *dv_sd_DL_shadow,
+
+        /* dP/du and dP/dv. */
+        ccl_global float3 *dPdu_sd,
+        ccl_global float3 *dPdu_sd_DL_shadow,
+
+        ccl_global float3 *dPdv_sd,
+        ccl_global float3 *dPdv_sd_DL_shadow,
+
+        /* Object motion. */
+        ccl_global Transform *ob_tfm_sd,
+        ccl_global Transform *ob_tfm_sd_DL_shadow,
+
+        ccl_global Transform *ob_itfm_sd,
+        ccl_global Transform *ob_itfm_sd_DL_shadow,
+
+        ShaderClosure *closure_sd,
+        ShaderClosure *closure_sd_DL_shadow,
+
+        ccl_global int *num_closure_sd,
+        ccl_global int *num_closure_sd_DL_shadow,
+
+        ccl_global float *randb_closure_sd,
+        ccl_global float *randb_closure_sd_DL_shadow,
+
+        ccl_global float3 *ray_P_sd,
+        ccl_global float3 *ray_P_sd_DL_shadow,
+
+        ccl_global differential3 *ray_dP_sd,
+        ccl_global differential3 *ray_dP_sd_DL_shadow,
+
+        ccl_constant KernelData *data,
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_state,
+        ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
+        ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
+        ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
+        PathRadiance *PathRadiance_coop,             /* PathRadiance array to store PathRadiance values for all rays */
+        ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
+        ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
+        ccl_global char *ray_state,                  /* Stores information on current state of a ray */
+
+#define KERNEL_TEX(type, ttype, name)                                   \
+        ccl_global type *name,
+#include "../../kernel_textures.h"  /* Presumably expands KERNEL_TEX once per texture, adding one pointer parameter each - confirm in that header. */
+
+        int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
+        int rng_state_offset_x,
+        int rng_state_offset_y,
+        int rng_state_stride,
+        ccl_global int *Queue_data,                  /* Memory for queues */
+        ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
+        int queuesize,                               /* Size (capacity) of the queue */
+        ccl_global char *use_queues_flag,            /* Flag to decide if scene-intersect kernel should use queues to fetch ray index */
+        ccl_global unsigned int *work_array,         /* Work array to store which work each ray belongs to */
+#ifdef __WORK_STEALING__
+        ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
+        unsigned int num_samples,                    /* Total number of samples per pixel */
+#endif
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples)                        /* Number of samples to be processed in parallel */
+{
+	kernel_data_init(globals,
+	                 shader_data_sd,
+	                 shader_data_sd_DL_shadow,
+	                 P_sd,
+	                 P_sd_DL_shadow,
+	                 N_sd,
+	                 N_sd_DL_shadow,
+	                 Ng_sd,
+	                 Ng_sd_DL_shadow,
+	                 I_sd,
+	                 I_sd_DL_shadow,
+	                 shader_sd,
+	                 shader_sd_DL_shadow,
+	                 flag_sd,
+	                 flag_sd_DL_shadow,
+	                 prim_sd,
+	                 prim_sd_DL_shadow,
+	                 type_sd,
+	                 type_sd_DL_shadow,
+	                 u_sd,
+	                 u_sd_DL_shadow,
+	                 v_sd,
+	                 v_sd_DL_shadow,
+	                 object_sd,
+	                 object_sd_DL_shadow,
+	                 time_sd,
+	                 time_sd_DL_shadow,
+	                 ray_length_sd,
+	                 ray_length_sd_DL_shadow,
+	                 ray_depth_sd,
+	                 ray_depth_sd_DL_shadow,
+	                 transparent_depth_sd,
+	                 transparent_depth_sd_DL_shadow,
+
+	                 /* Ray differentials. */
+	                 dP_sd,
+	                 dP_sd_DL_shadow,
+	                 dI_sd,
+	                 dI_sd_DL_shadow,
+	                 du_sd,
+	                 du_sd_DL_shadow,
+	                 dv_sd,
+	                 dv_sd_DL_shadow,
+
+	                 /* dP/du and dP/dv. */
+	                 dPdu_sd,
+	                 dPdu_sd_DL_shadow,
+	                 dPdv_sd,
+	                 dPdv_sd_DL_shadow,
+
+	                 /* Object motion. */
+	                 ob_tfm_sd,
+	                 ob_tfm_sd_DL_shadow,
+	                 ob_itfm_sd,
+	                 ob_itfm_sd_DL_shadow,
+
+	                 closure_sd,
+	                 closure_sd_DL_shadow,
+	                 num_closure_sd,
+	                 num_closure_sd_DL_shadow,
+	                 randb_closure_sd,
+	                 randb_closure_sd_DL_shadow,
+	                 ray_P_sd,
+	                 ray_P_sd_DL_shadow,
+	                 ray_dP_sd,
+	                 ray_dP_sd_DL_shadow,
+	                 data,
+	                 per_sample_output_buffers,
+	                 rng_state,
+	                 rng_coop,
+	                 throughput_coop,
+	                 L_transparent_coop,
+	                 PathRadiance_coop,
+	                 Ray_coop,
+	                 PathState_coop,
+	                 ray_state,
+
+#define KERNEL_TEX(type, ttype, name) name,
+#include "../../kernel_textures.h"  /* Same header again: KERNEL_TEX now yields the bare name, so this presumably forwards each texture pointer - confirm. */
+
+	                 start_sample, sx, sy, sw, sh, offset, stride,
+	                 rng_state_offset_x,
+	                 rng_state_offset_y,
+	                 rng_state_stride,
+	                 Queue_data,
+	                 Queue_index,
+	                 queuesize,
+	                 use_queues_flag,
+	                 work_array,
+#ifdef __WORK_STEALING__
+	                 work_pool_wgs,
+	                 num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+	                 debugdata_coop,
+#endif
+	                 parallel_samples);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
new file mode 100644
index 00000000000..6ec75013b3a
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_direct_lighting.h"
+
+__kernel void kernel_ocl_path_trace_direct_lighting(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,           /* Required for direct lighting */
+        ccl_global char *shader_DL,             /* Required for direct lighting */
+        ccl_global uint *rng_coop,              /* Required for direct lighting */
+        ccl_global PathState *PathState_coop,   /* Required for direct lighting */
+        ccl_global int *ISLamp_coop,            /* Required for direct lighting */
+        ccl_global Ray *LightRay_coop,          /* Required for direct lighting */
+        ccl_global BsdfEval *BSDFEval_coop,     /* Required for direct lighting */
+        ccl_global char *ray_state,             /* Denotes the state of each ray */
+        ccl_global int *Queue_data,             /* Queue memory */
+        ccl_global int *Queue_index,            /* Tracks the number of elements in each queue */
+        int queuesize)                          /* Size (capacity) of each queue */
+{
+	/* Work-group local counter handed to enqueue_ray_index_local(); the first work-item zeroes it. */
+	ccl_local unsigned int local_queue_atomics;
+	const bool is_group_leader = (get_local_id(0) == 0) && (get_local_id(1) == 0);
+	if(is_group_leader) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* Flatten the 2D global id and ask get_ray_index() which ray, if any,
+	 * this work-item handles from QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+	 */
+	const int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	const int ray_index = get_ray_index(thread_index,
+	                                    QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                                    Queue_data,
+	                                    queuesize,
+	                                    0);
+
+	char enqueue_flag = 0;
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* On a GPU device, work-items that got no ray exit right away. */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#else
+	/* On a CPU device all work-items must stay alive: there are barrier()
+	 * calls later in the kernel and CPU devices expect every work-item to
+	 * execute them. Work-items without a ray only skip the evaluation.
+	 */
+	if(ray_index != QUEUE_EMPTY_SLOT)
+#endif
+	{
+		enqueue_flag = kernel_direct_lighting(globals,
+		                                      data,
+		                                      shader_data,
+		                                      shader_DL,
+		                                      rng_coop,
+		                                      PathState_coop,
+		                                      ISLamp_coop,
+		                                      LightRay_coop,
+		                                      BSDFEval_coop,
+		                                      ray_state,
+		                                      ray_index);
+	}
+
+#ifdef __EMISSION__
+	/* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
+#endif
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
new file mode 100644
index 00000000000..ae5f5cd1b3b
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+
+__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Required throughout the kernel except probabilistic path termination and AO */
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_coop,             /* Required for "kernel_write_data_passes" and AO */
+        ccl_global float3 *throughput_coop,    /* Required for handling holdout material and AO */
+        ccl_global float *L_transparent_coop,  /* Required for handling holdout material */
+        PathRadiance *PathRadiance_coop,       /* Required for "kernel_write_data_passes" and indirect primitive emission */
+        ccl_global PathState *PathState_coop,  /* Required throughout the kernel and AO */
+        Intersection *Intersection_coop,       /* Required for indirect primitive emission */
+        ccl_global float3 *AOAlpha_coop,       /* Required for AO */
+        ccl_global float3 *AOBSDF_coop,        /* Required for AO */
+        ccl_global Ray *AOLightRay_coop,       /* Required for AO */
+        int sw, int sh, int sx, int sy, int stride,
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        ccl_global unsigned int *work_array,   /* Denotes the work that each ray belongs to */
+        ccl_global int *Queue_data,            /* Queue memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize,                         /* Size (capacity) of each queue */
+#ifdef __WORK_STEALING__
+        unsigned int start_sample,
+#endif
+        int parallel_samples)                  /* Number of samples to be processed in parallel */
+{
+	/* One work-group local counter per enqueue call below; the first work-item zeroes both. */
+	ccl_local unsigned int local_queue_atomics_bg;
+	ccl_local unsigned int local_queue_atomics_ao;
+	const bool is_group_leader = (get_local_id(0) == 0) && (get_local_id(1) == 0);
+	if(is_group_leader) {
+		local_queue_atomics_ao = 0;
+		local_queue_atomics_bg = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* Flatten the 2D global id and fetch the ray index, if any, for this work-item. */
+	const int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	const int ray_index = get_ray_index(thread_index,
+	                                    QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                                    Queue_data,
+	                                    queuesize,
+	                                    0);
+
+	/* Output flags, written through the pointers passed to the call below. */
+	char enqueue_flag = 0;
+	char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* On a GPU device, work-items that got no ray exit right away. */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#else
+	/* On a CPU device all work-items must stay alive: there are barrier()
+	 * calls later in the kernel and CPU devices expect every work-item to
+	 * execute them. Work-items without a ray only skip the evaluation.
+	 */
+	if(ray_index != QUEUE_EMPTY_SLOT)
+#endif
+	{
+		kernel_holdout_emission_blurring_pathtermination_ao(
+		        globals,
+		        data,
+		        shader_data,
+		        per_sample_output_buffers,
+		        rng_coop,
+		        throughput_coop,
+		        L_transparent_coop,
+		        PathRadiance_coop,
+		        PathState_coop,
+		        Intersection_coop,
+		        AOAlpha_coop,
+		        AOBSDF_coop,
+		        AOLightRay_coop,
+		        sw, sh, sx, sy, stride,
+		        ray_state,
+		        work_array,
+#ifdef __WORK_STEALING__
+		        start_sample,
+#endif
+		        parallel_samples,
+		        ray_index,
+		        &enqueue_flag,
+		        &enqueue_flag_AO_SHADOW_RAY_CAST);
+	}
+
+	/* Enqueue RAY_UPDATE_BUFFER rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics_bg,
+	                        Queue_data,
+	                        Queue_index);
+
+#ifdef __AO__
+	/* Enqueue to-shadow-ray-cast rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SHADOW_RAY_CAST_AO_RAYS,
+	                        enqueue_flag_AO_SHADOW_RAY_CAST,
+	                        queuesize,
+	                        &local_queue_atomics_ao,
+	                        Queue_data,
+	                        Queue_index);
+#endif
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
new file mode 100644
index 00000000000..1bc7808d834
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_lamp_emission.h"
+
+__kernel void kernel_ocl_path_trace_lamp_emission(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Required for lamp emission */
+        ccl_global float3 *throughput_coop,    /* Required for lamp emission */
+        PathRadiance *PathRadiance_coop,       /* Required for lamp emission */
+        ccl_global Ray *Ray_coop,              /* Required for lamp emission */
+        ccl_global PathState *PathState_coop,  /* Required for lamp emission */
+        Intersection *Intersection_coop,       /* Required for lamp emission */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        int sw, int sh,
+        ccl_global int *Queue_data,            /* Memory for queues */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in queues */
+        int queuesize,                         /* Size (capacity) of queues */
+        ccl_global char *use_queues_flag,      /* Used to decide if this kernel should use
+                                                * queues to fetch ray index
+                                                */
+        int parallel_samples)                  /* Number of samples to be processed in parallel */
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+
+	/* This kernel empties the queue, so a single work-item resets its element count. */
+	if((get_global_id(0) == 0) && (get_global_id(1) == 0)) {
+		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+	}
+
+	/* The first work-item of each group copies use_queues_flag into a
+	 * work-group local variable; the barrier precedes every read of it. */
+	ccl_local char local_use_queues_flag;
+	if((get_local_id(0) == 0) && (get_local_id(1) == 0)) {
+		local_use_queues_flag = use_queues_flag[0];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index;
+	if(local_use_queues_flag) {
+		/* Queue mode: the ray index comes from the queue; empty slots have no work. */
+		const int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+		ray_index = get_ray_index(thread_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          Queue_data,
+		                          queuesize,
+		                          1);
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	}
+	else {
+		/* Direct mode: the ray index follows from (x, y); ids outside the range have no work. */
+		const int row_width = sw * parallel_samples;
+		if(x >= row_width || y >= sh) {
+			return;
+		}
+		ray_index = x + y * row_width;
+	}
+
+	kernel_lamp_emission(globals, data, shader_data,
+	                     throughput_coop, PathRadiance_coop,
+	                     Ray_coop, PathState_coop, Intersection_coop,
+	                     ray_state,
+	                     sw, sh,
+	                     use_queues_flag,
+	                     parallel_samples,
+	                     ray_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
new file mode 100644
index 00000000000..dcf4db40411
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
@@ -0,0 +1,115 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_next_iteration_setup.h"
+
+__kernel void kernel_ocl_path_trace_next_iteration_setup(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,         /* Required for setting up ray for next iteration */
+        ccl_global uint *rng_coop,            /* Required for setting up ray for next iteration */
+        ccl_global float3 *throughput_coop,   /* Required for setting up ray for next iteration */
+        PathRadiance *PathRadiance_coop,      /* Required for setting up ray for next iteration */
+        ccl_global Ray *Ray_coop,             /* Required for setting up ray for next iteration */
+        ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
+        ccl_global Ray *LightRay_dl_coop,     /* Required for radiance update - direct lighting */
+        ccl_global int *ISLamp_coop,          /* Required for radiance update - direct lighting */
+        ccl_global BsdfEval *BSDFEval_coop,   /* Required for radiance update - direct lighting */
+        ccl_global Ray *LightRay_ao_coop,     /* Required for radiance update - AO */
+        ccl_global float3 *AOBSDF_coop,       /* Required for radiance update - AO */
+        ccl_global float3 *AOAlpha_coop,      /* Required for radiance update - AO */
+        ccl_global char *ray_state,           /* Denotes the state of each ray */
+        ccl_global int *Queue_data,           /* Queue memory */
+        ccl_global int *Queue_index,          /* Tracks the number of elements in each queue */
+        int queuesize,                        /* Size (capacity) of each queue */
+        ccl_global char *use_queues_flag)     /* flag to decide if scene_intersect kernel should
+                                               * use queues to fetch ray index */
+{
+	/* Work-group local counter handed to enqueue_ray_index_local(); the first work-item zeroes it. */
+	ccl_local unsigned int local_queue_atomics;
+	const bool is_group_leader = (get_local_id(0) == 0) && (get_local_id(1) == 0);
+	if(is_group_leader) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if((get_global_id(0) == 0) && (get_global_id(1) == 0)) {
+		/* Reaching this kernel means the scene-intersect kernel has already
+		 * run at least once. From now on the scene-intersect kernel may
+		 * operate on queues to fetch the ray index.
+		 */
+		use_queues_flag[0] = 1;
+
+		/* Reset the element counts of the QUEUE_SHADOW_RAY_CAST_DL_RAYS and
+		 * QUEUE_SHADOW_RAY_CAST_AO_RAYS queues, which were made empty during
+		 * the previous kernel.
+		 */
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+	}
+
+	/* Flatten the 2D global id and fetch the ray index, if any, for this work-item. */
+	const int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	const int ray_index = get_ray_index(thread_index,
+	                                    QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                                    Queue_data,
+	                                    queuesize,
+	                                    0);
+
+	/* Result of kernel_next_iteration_setup(); remains 0 for work-items without a ray. */
+	char enqueue_flag = 0;
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* On a GPU device, work-items that got no ray exit right away. */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#else
+	/* On a CPU device all work-items must stay alive: there are barrier()
+	 * calls later in the kernel and CPU devices expect every work-item to
+	 * execute them. Work-items without a ray only skip the evaluation.
+	 */
+	if(ray_index != QUEUE_EMPTY_SLOT)
+#endif
+	{
+		enqueue_flag = kernel_next_iteration_setup(globals,
+		                                           data,
+		                                           shader_data,
+		                                           rng_coop,
+		                                           throughput_coop,
+		                                           PathRadiance_coop,
+		                                           Ray_coop,
+		                                           PathState_coop,
+		                                           LightRay_dl_coop,
+		                                           ISLamp_coop,
+		                                           BSDFEval_coop,
+		                                           LightRay_ao_coop,
+		                                           AOBSDF_coop,
+		                                           AOAlpha_coop,
+		                                           ray_state,
+		                                           use_queues_flag,
+		                                           ray_index);
+	}
+
+	/* Enqueue RAY_UPDATE_BUFFER rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        queuesize,
+	                        &local_queue_atomics,
+	                        Queue_data,
+	                        Queue_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
new file mode 100644
index 00000000000..3156dc255fb
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../../kernel_compat_opencl.h"
+#include "../../kernel_math.h"
+#include "../../kernel_types.h"
+#include "../../kernel_globals.h"
+#include "../../kernel_queues.h"
+
+/*
+ * The kernel "kernel_queue_enqueue" enqueues rays of
+ * different ray state into their appropriate Queues;
+ * 1. Rays that have been determined to hit the background from the
+ * "kernel_scene_intersect" kernel
+ * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+ * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * The input and output of the kernel is as follows,
+ *
+ * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                           |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                           |
+ * queuesize -------------------------------------------|                           |
+ *
+ * Note on Queues :
+ * State of queues during the first time this kernel is called :
+ * At entry,
+ * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays.
+ *
+ * State of queue during other times this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
+ */
+__kernel void kernel_ocl_path_trace_queue_enqueue(
+        ccl_global int *Queue_data,   /* Queue memory */
+        ccl_global int *Queue_index,  /* Tracks the number of elements in each queue */
+        ccl_global char *ray_state,   /* Denotes the state of each ray */
+        int queuesize)                /* Size (capacity) of each queue */
+{
+	/* Two counters, one per destination queue (hit background / still active). */
+	ccl_local unsigned int local_queue_atomics[2];
+
+	const int local_linear_id = get_local_id(1) * get_local_size(0) + get_local_id(0);
+	const int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+
+	/* The first two work-items of the group clear one counter each. */
+	if(local_linear_id < 2) {
+		local_queue_atomics[local_linear_id] = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* Pick the destination queue from the ray state; -1 means this ray is not enqueued here. */
+	const int queue_number =
+	        (IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) ? QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS :
+	        (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) ? QUEUE_ACTIVE_AND_REGENERATED_RAYS :
+	        -1;
+	const bool needs_enqueue = (queue_number != -1);
+	/* Local (per work-group) queue index of this ray, as returned by get_local_queue_index(). */
+	unsigned int group_slot;
+	if(needs_enqueue) {
+		group_slot = get_local_queue_index(queue_number, local_queue_atomics);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* One work-item overwrites each counter with the result of get_global_per_queue_offset(). */
+	if(local_linear_id == 0) {
+		local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] =
+		        get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                                    local_queue_atomics,
+		                                    Queue_index);
+		local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] =
+		        get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+		                                    local_queue_atomics,
+		                                    Queue_index);
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* Store the ray index at the position get_global_queue_index() computes in Queue_data. */
+	if(needs_enqueue) {
+		const unsigned int queue_slot = get_global_queue_index(queue_number,
+		                                                       queuesize,
+		                                                       group_slot,
+		                                                       local_queue_atomics);
+		Queue_data[queue_slot] = ray_index;
+	}
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
new file mode 100644
index 00000000000..e5fad7bce50
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_scene_intersect.h"
+
+__kernel void kernel_ocl_path_trace_scene_intersect(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global uint *rng_coop,
+        ccl_global Ray *Ray_coop,              /* Required for scene_intersect */
+        ccl_global PathState *PathState_coop,  /* Required for scene_intersect */
+        Intersection *Intersection_coop,       /* Required for scene_intersect */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        int sw, int sh,
+        ccl_global int *Queue_data,            /* Memory for queues */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in queues */
+        int queuesize,                         /* Size (capacity) of queues */
+        ccl_global char *use_queues_flag,      /* used to decide if this kernel should use
+                                                * queues to fetch ray index */
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples)                  /* Number of samples to be processed in parallel */
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+
+	/* The first work-item of each group copies use_queues_flag into a
+	 * work-group local variable; the barrier precedes every read of it. */
+	ccl_local char local_use_queues_flag;
+	if((get_local_id(0) == 0) && (get_local_id(1) == 0)) {
+		local_use_queues_flag = use_queues_flag[0];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index;
+	if(local_use_queues_flag) {
+		/* Queue mode: the ray index comes from the queue; empty slots have no work. */
+		const int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+		ray_index = get_ray_index(thread_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          Queue_data,
+		                          queuesize,
+		                          0);
+
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	}
+	else {
+		/* Direct mode: the ray index follows from (x, y); ids outside the range have no work. */
+		const int row_width = sw * parallel_samples;
+		if(x >= row_width || y >= sh) {
+			return;
+		}
+		ray_index = x + y * row_width;
+	}
+
+	kernel_scene_intersect(globals, data, rng_coop,
+	                       Ray_coop, PathState_coop, Intersection_coop,
+	                       ray_state,
+	                       sw, sh,
+	                       use_queues_flag,
+#ifdef __KERNEL_DEBUG__
+	                       debugdata_coop,
+#endif
+	                       parallel_samples,
+	                       ray_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
new file mode 100644
index 00000000000..b9f616e6bdf
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_shader_eval.h"
+
+__kernel void kernel_ocl_path_trace_shader_eval(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Output ShaderData structure to be filled */
+        ccl_global uint *rng_coop,             /* Required for rbsdf calculation */
+        ccl_global Ray *Ray_coop,              /* Required for setting up shader from ray */
+        ccl_global PathState *PathState_coop,  /* Required for all functions in this kernel */
+        Intersection *Intersection_coop,       /* Required for setting up shader from ray */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        ccl_global int *Queue_data,            /* queue memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize)                         /* Size (capacity) of each queue */
+{
+	/* Work-group local counter handed to enqueue_ray_index_local(); the first work-item zeroes it. */
+	ccl_local unsigned int local_queue_atomics;
+	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+		local_queue_atomics = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          Queue_data, queuesize, 0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* On a GPU device, work-items that got no ray exit right away. */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+	/* Enqueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS.
+	 * On a CPU device work-items without a ray still make the enqueue call
+	 * (with the flag left at 0), as the other split kernels do, so that every
+	 * work-item reaches the barrier() calls that follow. */
+	char enqueue_flag = 0;
+	if(ray_index != QUEUE_EMPTY_SLOT && IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+		enqueue_flag = 1;
+	}
+	enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag, queuesize,
+	                        &local_queue_atomics, Queue_data, Queue_index);
+
+	/* Continue on with shader evaluation, only for work-items that own a ray. */
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+		kernel_shader_eval(globals, data, shader_data,
+		                   rng_coop, Ray_coop,
+		                   PathState_coop, Intersection_coop,
+		                   ray_state, ray_index);
+	}
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
new file mode 100644
index 00000000000..03886c0a030
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_shadow_blocked.h"
+
+__kernel void kernel_ocl_path_trace_shadow_blocked(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_shadow,        /* Required for shadow blocked */
+        ccl_global PathState *PathState_coop,  /* Required for shadow blocked */
+        ccl_global Ray *LightRay_dl_coop,      /* Required for direct lighting's shadow blocked */
+        ccl_global Ray *LightRay_ao_coop,      /* Required for AO's shadow blocked */
+        Intersection *Intersection_coop_AO,    /* Forwarded to kernel_shadow_blocked() */
+        Intersection *Intersection_coop_DL,    /* Forwarded to kernel_shadow_blocked() */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        ccl_global int *Queue_data,            /* Queue memory */
+        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
+        int queuesize,                         /* Size (capacity) of each queue */
+        int total_num_rays)                    /* Forwarded to kernel_shadow_blocked() */
+{
+#if 0
+	/* We will make the Queue_index entries '0' in the next kernel. */
+	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+		/* We empty this queue here */
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+	}
+#endif
+
+	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);  /* Linear index within the 2D work-group */
+
+	ccl_local unsigned int ao_queue_length;
+	ccl_local unsigned int dl_queue_length;
+	if(lidx == 0) {  /* Exactly one work-item per group loads the queue lengths */
+		ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
+		dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	/* flag determining if the current ray is to process shadow ray for AO or DL */
+	char shadow_blocked_type = -1;
+
+	int ray_index = QUEUE_EMPTY_SLOT;
+	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+	if(thread_index < ao_queue_length + dl_queue_length) {
+		if(thread_index < ao_queue_length) {
+			ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1);
+			shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO;
+		} else {
+			ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1);
+			shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL;
+		}
+	}
+
+	if(ray_index == QUEUE_EMPTY_SLOT)
+		return;
+
+	kernel_shadow_blocked(globals,
+	                      data,
+	                      shader_shadow,
+	                      PathState_coop,
+	                      LightRay_dl_coop,
+	                      LightRay_ao_coop,
+	                      Intersection_coop_AO,
+	                      Intersection_coop_DL,
+	                      ray_state,
+	                      total_num_rays,
+	                      shadow_blocked_type,
+	                      ray_index);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
new file mode 100644
index 00000000000..88a1ed830af
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "split/kernel_sum_all_radiance.h"
+
+__kernel void kernel_ocl_path_trace_sum_all_radiance(
+        ccl_constant KernelData *data,               /* To get pass_stride to offset into buffer */
+        ccl_global float *buffer,                    /* Output buffer of RenderTile */
+        ccl_global float *per_sample_output_buffer,  /* Radiance contributed by all samples */
+        int parallel_samples, int sw, int sh, int stride,
+        int buffer_offset_x,
+        int buffer_offset_y,
+        int buffer_stride,
+        int start_sample)
+{
+	kernel_sum_all_radiance(data,                /* Entry point only forwards its arguments, unchanged and in order */
+	                        buffer,
+	                        per_sample_output_buffer,
+	                        parallel_samples,
+	                        sw, sh, stride,
+	                        buffer_offset_x,
+	                        buffer_offset_y,
+	                        buffer_stride,
+	                        start_sample);
+}
diff --git a/intern/cycles/kernel/osl/SConscript b/intern/cycles/kernel/osl/SConscript
index 4685bb7753e..74ba5e1020c 100644
--- a/intern/cycles/kernel/osl/SConscript
+++ b/intern/cycles/kernel/osl/SConscript
@@ -38,11 +38,38 @@ incs.append(env['BF_OIIO_INC'])
 incs.append(env['BF_BOOST_INC'])
 incs.append(env['BF_OSL_INC'])
 incs.append(env['BF_OPENEXR_INC'].split())
+incs.append('#/intern/atomic')
 
 defs.append('CCL_NAMESPACE_BEGIN=namespace ccl {')
 defs.append('CCL_NAMESPACE_END=}')
 defs.append('WITH_OSL')
 
+if env['WITH_UNORDERED_MAP_SUPPORT']:
+    if env['UNORDERED_MAP_HEADER'] == 'unordered_map':
+        if env['UNORDERED_MAP_NAMESPACE'] == 'std':
+            defs.append('CYCLES_STD_UNORDERED_MAP')
+        elif env['UNORDERED_MAP_NAMESPACE'] == 'std::tr1':
+            defs.append('CYCLES_STD_UNORDERED_MAP_IN_TR1_NAMESPACE')
+    elif env['UNORDERED_MAP_NAMESPACE'] == 'std::tr1':
+        defs.append('CYCLES_TR1_UNORDERED_MAP')
+else:
+    print("-- Replacing unordered_map/set with map/set (warning: slower!)")
+    defs.append('CYCLES_NO_UNORDERED_MAP')
+
+if env['WITH_BF_CYCLES_DEBUG']:
+    defs.append('WITH_CYCLES_DEBUG')
+
+if env['WITH_BF_CYCLES_LOGGING']:
+    defs.append('WITH_CYCLES_LOGGING')
+    defs.append('GOOGLE_GLOG_DLL_DECL=')
+    defs.append('CYCLES_GFLAGS_NAMESPACE=gflags')
+    if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', 'win64-mingw'):
+        incs.append('#extern/libmv/third_party/glog/src/windows')
+        incs.append('#extern/libmv/third_party/gflags')
+    else:
+        incs.append('#extern/libmv/third_party/glog/src')
+        incs.append('#extern/libmv/third_party/gflags')
+
 if env['OURPLATFORM'] in ('win32-vc', 'win64-vc'):
     cxxflags.append('-DBOOST_NO_RTTI -DBOOST_NO_TYPEID /fp:fast'.split())
     incs.append(env['BF_PTHREADS_INC'])
diff --git a/intern/cycles/kernel/osl/background.cpp b/intern/cycles/kernel/osl/background.cpp
index 2facced0914..4d70bc80006 100644
--- a/intern/cycles/kernel/osl/background.cpp
+++ b/intern/cycles/kernel/osl/background.cpp
@@ -77,7 +77,7 @@ public:
 ClosureParam *closure_background_params()
 {
 	static ClosureParam params[] = {
-	    CLOSURE_STRING_KEYPARAM("label"),
+	    CLOSURE_STRING_KEYPARAM(GenericBackgroundClosure, label, "label"),
 	    CLOSURE_FINISH_PARAM(GenericBackgroundClosure)
 	};
 	return params;
@@ -98,7 +98,7 @@ CCLOSURE_PREPARE(closure_holdout_prepare, HoldoutClosure)
 ClosureParam *closure_ambient_occlusion_params()
 {
 	static ClosureParam params[] = {
-	    CLOSURE_STRING_KEYPARAM("label"),
+	    CLOSURE_STRING_KEYPARAM(AmbientOcclusionClosure, label, "label"),
 	    CLOSURE_FINISH_PARAM(AmbientOcclusionClosure)
 	};
 	return params;
diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
index 8f9c2efd470..b3c71e4a706 100644
--- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
@@ -34,6 +34,7 @@
 
 #include <OSL/genclosure.h>
 
+#include "kernel_compat_cpu.h"
 #include "osl_closures.h"
 
 #include "kernel_types.h"
@@ -92,7 +93,7 @@ ClosureParam *closure_bsdf_diffuse_ramp_params()
 	static ClosureParam params[] = {
 		CLOSURE_FLOAT3_PARAM(DiffuseRampClosure, sc.N),
 		CLOSURE_COLOR_ARRAY_PARAM(DiffuseRampClosure, colors, 8),
-		CLOSURE_STRING_KEYPARAM("label"),
+		CLOSURE_STRING_KEYPARAM(DiffuseRampClosure, label, "label"),
 	    CLOSURE_FINISH_PARAM(DiffuseRampClosure)
 	};
 	return params;
diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
index c5851747b54..99f510d31ed 100644
--- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
@@ -34,6 +34,7 @@
 
 #include <OSL/genclosure.h>
 
+#include "kernel_compat_cpu.h"
 #include "osl_closures.h"
 
 #include "kernel_types.h"
@@ -92,7 +93,7 @@ ClosureParam *closure_bsdf_phong_ramp_params()
 		CLOSURE_FLOAT3_PARAM(PhongRampClosure, sc.N),
 		CLOSURE_FLOAT_PARAM(PhongRampClosure, sc.data0),
 		CLOSURE_COLOR_ARRAY_PARAM(PhongRampClosure, colors, 8),
-		CLOSURE_STRING_KEYPARAM("label"),
+		CLOSURE_STRING_KEYPARAM(PhongRampClosure, label, "label"),
 	    CLOSURE_FINISH_PARAM(PhongRampClosure)
 	};
 	return params;
diff --git a/intern/cycles/kernel/osl/emissive.cpp b/intern/cycles/kernel/osl/emissive.cpp
index 02935542c56..9a95fa57a81 100644
--- a/intern/cycles/kernel/osl/emissive.cpp
+++ b/intern/cycles/kernel/osl/emissive.cpp
@@ -77,7 +77,7 @@ public:
 ClosureParam *closure_emission_params()
 {
 	static ClosureParam params[] = {
-	    CLOSURE_STRING_KEYPARAM("label"),
+	    CLOSURE_STRING_KEYPARAM(GenericEmissiveClosure, label, "label"),
 	    CLOSURE_FINISH_PARAM(GenericEmissiveClosure)
 	};
 	return params;
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index 84ef85e089d..bc395922077 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -34,6 +34,7 @@
 
 #include <OSL/genclosure.h>
 
+#include "kernel_compat_cpu.h"
 #include "osl_bssrdf.h"
 #include "osl_closures.h"
 
@@ -68,7 +69,7 @@ ClosureParam *closure_bssrdf_cubic_params()
 		CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, radius),
 		CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.data1),
 		CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.T.x),
-	    CLOSURE_STRING_KEYPARAM("label"),
+	    CLOSURE_STRING_KEYPARAM(CubicBSSRDFClosure, label, "label"),
 	    CLOSURE_FINISH_PARAM(CubicBSSRDFClosure)
 	};
 	return params;
@@ -96,7 +97,7 @@ ClosureParam *closure_bssrdf_gaussian_params()
 		CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, sc.N),
 		CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, radius),
 		CLOSURE_FLOAT_PARAM(GaussianBSSRDFClosure, sc.data1),
-	    CLOSURE_STRING_KEYPARAM("label"),
+	    CLOSURE_STRING_KEYPARAM(GaussianBSSRDFClosure, label, "label"),
 	    CLOSURE_FINISH_PARAM(GaussianBSSRDFClosure)
 	};
 	return params;
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index d7789edcfff..461ce8f7598 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -54,7 +54,6 @@
 #include "closure/bsdf_refraction.h"
 #include "closure/bsdf_transparent.h"
 #include "closure/bsdf_ashikhmin_shirley.h"
-#include "closure/bsdf_westin.h"
 #include "closure/bsdf_toon.h"
 #include "closure/bsdf_hair.h"
 #include "closure/volume.h"
@@ -87,16 +86,6 @@ BSDF_CLOSURE_CLASS_BEGIN(Refraction, refraction, refraction, LABEL_SINGULAR)
 	CLOSURE_FLOAT_PARAM(RefractionClosure, sc.data0),
 BSDF_CLOSURE_CLASS_END(Refraction, refraction)
 
-BSDF_CLOSURE_CLASS_BEGIN(WestinBackscatter, westin_backscatter, westin_backscatter, LABEL_GLOSSY)
-	CLOSURE_FLOAT3_PARAM(WestinBackscatterClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(WestinBackscatterClosure, sc.data0),
-BSDF_CLOSURE_CLASS_END(WestinBackscatter, westin_backscatter)
-
-BSDF_CLOSURE_CLASS_BEGIN(WestinSheen, westin_sheen, westin_sheen, LABEL_DIFFUSE)
-	CLOSURE_FLOAT3_PARAM(WestinSheenClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(WestinSheenClosure, sc.data0),
-BSDF_CLOSURE_CLASS_END(WestinSheen, westin_sheen)
-
 BSDF_CLOSURE_CLASS_BEGIN(Transparent, transparent, transparent, LABEL_SINGULAR)
 BSDF_CLOSURE_CLASS_END(Transparent, transparent)
 
@@ -164,26 +153,16 @@ BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, hair_reflection, LABEL
 	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.N),
 	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data0),
 	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data1),
-#ifdef __HAIR__
 	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.T),
 	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data2),
-#else
-	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data1),
-#endif
 BSDF_CLOSURE_CLASS_END(HairReflection, hair_reflection)
 
 BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, hair_transmission, LABEL_GLOSSY)
 	CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, sc.N),
 	CLOSURE_FLOAT_PARAM(HairTransmissionClosure, sc.data0),
 	CLOSURE_FLOAT_PARAM(HairTransmissionClosure, sc.data1),
-#ifdef __HAIR__
 	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.T),
 	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data2),
-#else
-	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data1),
-#endif
 BSDF_CLOSURE_CLASS_END(HairTransmission, hair_transmission)
 
 VOLUME_CLOSURE_CLASS_BEGIN(VolumeHenyeyGreenstein, henyey_greenstein, LABEL_VOLUME_SCATTER)
@@ -200,11 +179,7 @@ static void register_closure(OSL::ShadingSystem *ss, const char *name, int id, O
 	/* optimization: it's possible to not use a prepare function at all and
 	 * only initialize the actual class when accessing the closure component
 	 * data, but then we need to map the id to the class somehow */
-#ifdef CLOSURE_PREPARE
-	ss->register_closure(name, id, params, prepare, NULL, NULL);
-#else
-	ss->register_closure(name, id, params, prepare, NULL);
-#endif
+	ss->register_closure(name, id, params, prepare, NULL, 16);
 }
 
 void OSLShader::register_closures(OSLShadingSystem *ss_)
@@ -244,10 +219,6 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
 		bsdf_diffuse_toon_params(), bsdf_diffuse_toon_prepare);
 	register_closure(ss, "glossy_toon", id++,
 		bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare);
-	register_closure(ss, "westin_backscatter", id++,
-		bsdf_westin_backscatter_params(), bsdf_westin_backscatter_prepare);
-	register_closure(ss, "westin_sheen", id++,
-		bsdf_westin_sheen_params(), bsdf_westin_sheen_prepare);
 
 	register_closure(ss, "emission", id++,
 		closure_emission_params(), closure_emission_prepare);
diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h
index 58d215295dc..97bd1b1ac92 100644
--- a/intern/cycles/kernel/osl/osl_closures.h
+++ b/intern/cycles/kernel/osl/osl_closures.h
@@ -48,8 +48,6 @@ OSL::ClosureParam *closure_holdout_params();
 OSL::ClosureParam *closure_ambient_occlusion_params();
 OSL::ClosureParam *closure_bsdf_diffuse_ramp_params();
 OSL::ClosureParam *closure_bsdf_phong_ramp_params();
-OSL::ClosureParam *closure_westin_backscatter_params();
-OSL::ClosureParam *closure_westin_sheen_params();
 OSL::ClosureParam *closure_bssrdf_cubic_params();
 OSL::ClosureParam *closure_bssrdf_gaussian_params();
 OSL::ClosureParam *closure_henyey_greenstein_volume_params();
@@ -60,8 +58,6 @@ void closure_holdout_prepare(OSL::RendererServices *, int id, void *data);
 void closure_ambient_occlusion_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bsdf_diffuse_ramp_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bsdf_phong_ramp_prepare(OSL::RendererServices *, int id, void *data);
-void closure_westin_backscatter_prepare(OSL::RendererServices *, int id, void *data);
-void closure_westin_sheen_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bssrdf_cubic_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bssrdf_gaussian_prepare(OSL::RendererServices *, int id, void *data);
 void closure_henyey_greenstein_volume_prepare(OSL::RendererServices *, int id, void *data);
@@ -82,6 +78,11 @@ void name(RendererServices *, int id, void *data) \
 #define TO_COLOR3(v) OSL::Color3(v.x, v.y, v.z)
 #define TO_FLOAT3(v) make_float3(v[0], v[1], v[2])
 
+#if OSL_LIBRARY_VERSION_CODE < 10700
+#  undef CLOSURE_STRING_KEYPARAM
+#  define CLOSURE_STRING_KEYPARAM(st, fld, key) { TypeDesc::TypeString, 0, key, 0 }
+#endif
+
 /* Closure */
 
 class CClosurePrimitive {
@@ -101,6 +102,10 @@ public:
 	virtual void setup() {}
 
 	Category category;
+
+#if OSL_LIBRARY_VERSION_CODE >= 10700
+	OSL::ustring label;
+#endif
 };
 
 /* BSDF */
@@ -151,14 +156,14 @@ public: \
 \
 	float3 eval_reflect(const float3 &omega_out, const float3 &omega_in, float& pdf) const \
 	{ \
-		pdf = 0; \
-		return make_float3(0, 0, 0); \
+		pdf = 0.0f; \
+		return make_float3(0.0f, 0.0f, 0.0f); \
 	} \
 \
 	float3 eval_transmit(const float3 &omega_out, const float3 &omega_in, float& pdf) const \
 	{ \
-		pdf = 0; \
-		return make_float3(0, 0, 0); \
+		pdf = 0.0f; \
+		return make_float3(0.0f, 0.0f, 0.0f); \
 	} \
 \
 	int sample(const float3 &Ng, \
@@ -179,7 +184,7 @@ static ClosureParam *bsdf_##lower##_params() \
 /* parameters */
 
 #define BSDF_CLOSURE_CLASS_END(Upper, lower) \
-		CLOSURE_STRING_KEYPARAM("label"), \
+		CLOSURE_STRING_KEYPARAM(Upper##Closure, label, "label"), \
 	    CLOSURE_FINISH_PARAM(Upper##Closure) \
 	}; \
 	return params; \
@@ -227,7 +232,7 @@ static ClosureParam *volume_##lower##_params() \
 /* parameters */
 
 #define VOLUME_CLOSURE_CLASS_END(Upper, lower) \
-		CLOSURE_STRING_KEYPARAM("label"), \
+		CLOSURE_STRING_KEYPARAM(Upper##Closure, label, "label"), \
 	    CLOSURE_FINISH_PARAM(Upper##Closure) \
 	}; \
 	return params; \
diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h
index 5a658d8244a..e349ac676b0 100644
--- a/intern/cycles/kernel/osl/osl_globals.h
+++ b/intern/cycles/kernel/osl/osl_globals.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __OSL_GLOBALS_H__
@@ -20,7 +20,6 @@
 #ifdef WITH_OSL
 
 #include <OSL/oslexec.h>
-#include <cmath>
 
 #include "util_map.h"
 #include "util_param.h"
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 1475e5a0a62..3c1955a1e1e 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -11,9 +11,18 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
+/* TODO(sergey): There is a bit of headers dependency hell going on
+ * here, so for now we just put here. In the future it might be better
+ * to have dedicated file for such tweaks.
+ */
+#if defined(__GNUC__) && defined(NDEBUG)
+#  pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#  pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+
 #include <string.h>
 
 #include "mesh.h"
@@ -130,12 +139,12 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
-	if (xform) {
+	if(xform) {
 		const ShaderData *sd = (const ShaderData *)xform;
 		KernelGlobals *kg = sd->osl_globals;
 		int object = sd->object;
 
-		if (object != OBJECT_NONE) {
+		if(object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
 			Transform tfm;
 
@@ -160,12 +169,12 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
-	if (xform) {
+	if(xform) {
 		const ShaderData *sd = (const ShaderData *)xform;
 		KernelGlobals *kg = sd->osl_globals;
 		int object = sd->object;
 
-		if (object != OBJECT_NONE) {
+		if(object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
 			Transform itfm;
 
@@ -190,27 +199,27 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 {
 	KernelGlobals *kg = kernel_globals;
 
-	if (from == u_ndc) {
+	if(from == u_ndc) {
 		Transform tfm = transform_transpose(transform_quick_inverse(kernel_data.cam.worldtondc));
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_raster) {
+	else if(from == u_raster) {
 		Transform tfm = transform_transpose(kernel_data.cam.rastertoworld);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_screen) {
+	else if(from == u_screen) {
 		Transform tfm = transform_transpose(kernel_data.cam.screentoworld);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_camera) {
+	else if(from == u_camera) {
 		Transform tfm = transform_transpose(kernel_data.cam.cameratoworld);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_world) {
+	else if(from == u_world) {
 		result.makeIdentity();
 		return true;
 	}
@@ -222,27 +231,27 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 {
 	KernelGlobals *kg = kernel_globals;
 
-	if (to == u_ndc) {
+	if(to == u_ndc) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtondc);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_raster) {
+	else if(to == u_raster) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtoraster);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_screen) {
+	else if(to == u_screen) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtoscreen);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_camera) {
+	else if(to == u_camera) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtocamera);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_world) {
+	else if(to == u_world) {
 		result.makeIdentity();
 		return true;
 	}
@@ -254,11 +263,11 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
-	if (xform) {
+	if(xform) {
 		const ShaderData *sd = (const ShaderData *)xform;
 		int object = sd->object;
 
-		if (object != OBJECT_NONE) {
+		if(object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
 			Transform tfm = sd->ob_tfm;
 #else
@@ -279,11 +288,11 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
-	if (xform) {
+	if(xform) {
 		const ShaderData *sd = (const ShaderData *)xform;
 		int object = sd->object;
 
-		if (object != OBJECT_NONE) {
+		if(object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
 			Transform tfm = sd->ob_itfm;
 #else
@@ -304,22 +313,22 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 {
 	KernelGlobals *kg = kernel_globals;
 
-	if (from == u_ndc) {
+	if(from == u_ndc) {
 		Transform tfm = transform_transpose(transform_quick_inverse(kernel_data.cam.worldtondc));
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_raster) {
+	else if(from == u_raster) {
 		Transform tfm = transform_transpose(kernel_data.cam.rastertoworld);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_screen) {
+	else if(from == u_screen) {
 		Transform tfm = transform_transpose(kernel_data.cam.screentoworld);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (from == u_camera) {
+	else if(from == u_camera) {
 		Transform tfm = transform_transpose(kernel_data.cam.cameratoworld);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
@@ -332,22 +341,22 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 {
 	KernelGlobals *kg = kernel_globals;
 	
-	if (to == u_ndc) {
+	if(to == u_ndc) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtondc);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_raster) {
+	else if(to == u_raster) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtoraster);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_screen) {
+	else if(to == u_screen) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtoscreen);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
 	}
-	else if (to == u_camera) {
+	else if(to == u_camera) {
 		Transform tfm = transform_transpose(kernel_data.cam.worldtocamera);
 		COPY_MATRIX44(&result, &tfm);
 		return true;
@@ -365,8 +374,8 @@ bool OSLRenderServices::get_array_attribute(OSL::ShaderGlobals *sg, bool derivat
 
 static bool set_attribute_float3(float3 f[3], TypeDesc type, bool derivatives, void *val)
 {
-	if (type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
-	    type == TypeDesc::TypeNormal || type == TypeDesc::TypeColor)
+	if(type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
+	   type == TypeDesc::TypeNormal || type == TypeDesc::TypeColor)
 	{
 		float *fval = (float *)val;
 
@@ -374,7 +383,7 @@ static bool set_attribute_float3(float3 f[3], TypeDesc type, bool derivatives, v
 		fval[1] = f[0].y;
 		fval[2] = f[0].z;
 
-		if (derivatives) {
+		if(derivatives) {
 			fval[3] = f[1].x;
 			fval[4] = f[1].y;
 			fval[5] = f[1].z;
@@ -390,7 +399,7 @@ static bool set_attribute_float3(float3 f[3], TypeDesc type, bool derivatives, v
 		float *fval = (float *)val;
 		fval[0] = average(f[0]);
 
-		if (derivatives) {
+		if(derivatives) {
 			fval[1] = average(f[1]);
 			fval[2] = average(f[2]);
 		}
@@ -414,15 +423,15 @@ static bool set_attribute_float3(float3 f, TypeDesc type, bool derivatives, void
 
 static bool set_attribute_float(float f[3], TypeDesc type, bool derivatives, void *val)
 {
-	if (type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
-	    type == TypeDesc::TypeNormal || type == TypeDesc::TypeColor)
+	if(type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
+	   type == TypeDesc::TypeNormal || type == TypeDesc::TypeColor)
 	{
 		float *fval = (float *)val;
 		fval[0] = f[0];
 		fval[1] = f[1];
 		fval[2] = f[2];
 
-		if (derivatives) {
+		if(derivatives) {
 			fval[3] = f[1];
 			fval[4] = f[1];
 			fval[5] = f[1];
@@ -438,7 +447,7 @@ static bool set_attribute_float(float f[3], TypeDesc type, bool derivatives, voi
 		float *fval = (float *)val;
 		fval[0] = f[0];
 
-		if (derivatives) {
+		if(derivatives) {
 			fval[1] = f[1];
 			fval[2] = f[2];
 		}
@@ -466,7 +475,7 @@ static bool set_attribute_int(int i, TypeDesc type, bool derivatives, void *val)
 		int *ival = (int *)val;
 		ival[0] = i;
 
-		if (derivatives) {
+		if(derivatives) {
 			ival[1] = 0;
 			ival[2] = 0;
 		}
@@ -483,7 +492,7 @@ static bool set_attribute_string(ustring str, TypeDesc type, bool derivatives, v
 		ustring *sval = (ustring *)val;
 		sval[0] = str;
 
-		if (derivatives) {
+		if(derivatives) {
 			sval[1] = OSLRenderServices::u_empty;
 			sval[2] = OSLRenderServices::u_empty;
 		}
@@ -513,7 +522,7 @@ static bool set_attribute_float3_3(float3 P[3], TypeDesc type, bool derivatives,
 
 		if(type.arraylen > 3)
 			memset(fval + 3*3, 0, sizeof(float)*3*(type.arraylen - 3));
-		if (derivatives)
+		if(derivatives)
 			memset(fval + type.arraylen*3, 0, sizeof(float)*2*3*type.arraylen);
 
 		return true;
@@ -536,15 +545,15 @@ static bool set_attribute_matrix(const Transform& tfm, TypeDesc type, void *val)
 static bool get_mesh_element_attribute(KernelGlobals *kg, const ShaderData *sd, const OSLGlobals::Attribute& attr,
                                const TypeDesc& type, bool derivatives, void *val)
 {
-	if (attr.type == TypeDesc::TypePoint || attr.type == TypeDesc::TypeVector ||
-	    attr.type == TypeDesc::TypeNormal || attr.type == TypeDesc::TypeColor)
+	if(attr.type == TypeDesc::TypePoint || attr.type == TypeDesc::TypeVector ||
+	   attr.type == TypeDesc::TypeNormal || attr.type == TypeDesc::TypeColor)
 	{
 		float3 fval[3];
 		fval[0] = primitive_attribute_float3(kg, sd, attr.elem, attr.offset,
 		                                     (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
 		return set_attribute_float3(fval, type, derivatives, val);
 	}
-	else if (attr.type == TypeDesc::TypeFloat) {
+	else if(attr.type == TypeDesc::TypeFloat) {
 		float fval[3];
 		fval[0] = primitive_attribute_float(kg, sd, attr.elem, attr.offset,
 		                                    (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
@@ -558,7 +567,7 @@ static bool get_mesh_element_attribute(KernelGlobals *kg, const ShaderData *sd,
 static bool get_mesh_attribute(KernelGlobals *kg, const ShaderData *sd, const OSLGlobals::Attribute& attr,
                                const TypeDesc& type, bool derivatives, void *val)
 {
-	if (attr.type == TypeDesc::TypeMatrix) {
+	if(attr.type == TypeDesc::TypeMatrix) {
 		Transform tfm = primitive_attribute_matrix(kg, sd, attr.offset);
 		return set_attribute_matrix(tfm, type, val);
 	}
@@ -572,7 +581,7 @@ static void get_object_attribute(const OSLGlobals::Attribute& attr, bool derivat
 	size_t datasize = attr.value.datasize();
 
 	memcpy(val, attr.value.data(), datasize);
-	if (derivatives)
+	if(derivatives)
 		memset((char *)val + datasize, 0, datasize * 2);
 }
 
@@ -582,80 +591,80 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD
 	/* todo: turn this into hash table? */
 
 	/* Object Attributes */
-	if (name == u_object_location) {
+	if(name == u_object_location) {
 		float3 f = object_location(kg, sd);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
-	else if (name == u_object_index) {
+	else if(name == u_object_index) {
 		float f = object_pass_id(kg, sd->object);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_geom_dupli_generated) {
+	else if(name == u_geom_dupli_generated) {
 		float3 f = object_dupli_generated(kg, sd->object);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
-	else if (name == u_geom_dupli_uv) {
+	else if(name == u_geom_dupli_uv) {
 		float3 f = object_dupli_uv(kg, sd->object);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
-	else if (name == u_material_index) {
+	else if(name == u_material_index) {
 		float f = shader_pass_id(kg, sd);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_object_random) {
+	else if(name == u_object_random) {
 		float f = object_random_number(kg, sd->object);
 		return set_attribute_float(f, type, derivatives, val);
 	}
 
 	/* Particle Attributes */
-	else if (name == u_particle_index) {
+	else if(name == u_particle_index) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float f = particle_index(kg, particle_id);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_particle_age) {
+	else if(name == u_particle_age) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float f = particle_age(kg, particle_id);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_particle_lifetime) {
+	else if(name == u_particle_lifetime) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float f = particle_lifetime(kg, particle_id);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_particle_location) {
+	else if(name == u_particle_location) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float3 f = particle_location(kg, particle_id);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
 #if 0	/* unsupported */
-	else if (name == u_particle_rotation) {
+	else if(name == u_particle_rotation) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float4 f = particle_rotation(kg, particle_id);
 		return set_attribute_float4(f, type, derivatives, val);
 	}
 #endif
-	else if (name == u_particle_size) {
+	else if(name == u_particle_size) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float f = particle_size(kg, particle_id);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_particle_velocity) {
+	else if(name == u_particle_velocity) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float3 f = particle_velocity(kg, particle_id);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
-	else if (name == u_particle_angular_velocity) {
+	else if(name == u_particle_angular_velocity) {
 		int particle_id = object_particle_id(kg, sd->object);
 		float3 f = particle_angular_velocity(kg, particle_id);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
 	
 	/* Geometry Attributes */
-	else if (name == u_geom_numpolyvertices) {
+	else if(name == u_geom_numpolyvertices) {
 		return set_attribute_int(3, type, derivatives, val);
 	}
-	else if ((name == u_geom_trianglevertices || name == u_geom_polyvertices)
+	else if((name == u_geom_trianglevertices || name == u_geom_polyvertices)
 #ifdef __HAIR__
 		     && sd->type & PRIMITIVE_ALL_TRIANGLE)
 #else
@@ -681,21 +690,21 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD
 		ustring object_name = kg->osl->object_names[sd->object];
 		return set_attribute_string(object_name, type, derivatives, val);
 	}
-	else if (name == u_is_smooth) {
+	else if(name == u_is_smooth) {
 		float f = ((sd->shader & SHADER_SMOOTH_NORMAL) != 0);
 		return set_attribute_float(f, type, derivatives, val);
 	}
 #ifdef __HAIR__
 	/* Hair Attributes */
-	else if (name == u_is_curve) {
+	else if(name == u_is_curve) {
 		float f = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_curve_thickness) {
+	else if(name == u_curve_thickness) {
 		float f = curve_thickness(kg, sd);
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_curve_tangent_normal) {
+	else if(name == u_curve_tangent_normal) {
 		float3 f = curve_tangent_normal(kg, sd);
 		return set_attribute_float3(f, type, derivatives, val);
 	}
@@ -707,22 +716,22 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD
 bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData *sd, ustring name,
                                                  TypeDesc type, bool derivatives, void *val)
 {
-	if (name == u_path_ray_length) {
+	if(name == u_path_ray_length) {
 		/* Ray Length */
 		float f = sd->ray_length;
 		return set_attribute_float(f, type, derivatives, val);
 	}
-	else if (name == u_path_ray_depth) {
+	else if(name == u_path_ray_depth) {
 		/* Ray Depth */
 		int f = sd->ray_depth;
 		return set_attribute_int(f, type, derivatives, val);
 	}
-	else if (name == u_path_transparent_depth) {
+	else if(name == u_path_transparent_depth) {
 		/* Transparent Ray Depth */
 		int f = sd->transparent_depth;
 		return set_attribute_int(f, type, derivatives, val);
 	}
-	else if (name == u_ndc) {
+	else if(name == u_ndc) {
 		/* NDC coordinates with special exception for otho */
 		OSLThreadData *tdata = kg->osl_tdata;
 		OSL::ShaderGlobals *globals = &tdata->globals;
@@ -754,7 +763,7 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData *
 bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object_name,
                                       TypeDesc type, ustring name, void *val)
 {
-	if (sg->renderstate == NULL)
+	if(sg->renderstate == NULL)
 		return false;
 
 	ShaderData *sd = (ShaderData *)(sg->renderstate);
@@ -769,10 +778,10 @@ bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring
 	int object;
 
 	/* lookup of attribute on another object */
-	if (object_name != u_empty) {
+	if(object_name != u_empty) {
 		OSLGlobals::ObjectNameMap::iterator it = kg->osl->object_name_map.find(object_name);
 
-		if (it == kg->osl->object_name_map.end())
+		if(it == kg->osl->object_name_map.end())
 			return false;
 
 		object = it->second;
@@ -782,7 +791,7 @@ bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring
 		object = sd->object;
 		is_curve = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
 
-		if (object == OBJECT_NONE)
+		if(object == OBJECT_NONE)
 			return get_background_attribute(kg, sd, name, type, derivatives, val);
 	}
 
@@ -791,10 +800,10 @@ bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring
 	OSLGlobals::AttributeMap& attribute_map = kg->osl->attribute_map[object];
 	OSLGlobals::AttributeMap::iterator it = attribute_map.find(name);
 
-	if (it != attribute_map.end()) {
+	if(it != attribute_map.end()) {
 		const OSLGlobals::Attribute& attr = it->second;
 
-		if (attr.elem != ATTR_ELEMENT_OBJECT) {
+		if(attr.elem != ATTR_ELEMENT_OBJECT) {
 			/* triangle and vertex attributes */
 			if(get_mesh_element_attribute(kg, sd, attr, type, derivatives, val))
 				return true;
@@ -811,7 +820,7 @@ bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring
 		/* not found in attribute, check standard object info */
 		bool is_std_object_attribute = get_object_standard_attribute(kg, sd, name, type, derivatives, val);
 
-		if (is_std_object_attribute)
+		if(is_std_object_attribute)
 			return true;
 
 		return get_background_attribute(kg, sd, name, type, derivatives, val);
@@ -834,7 +843,7 @@ bool OSLRenderServices::has_userdata(ustring name, TypeDesc type, OSL::ShaderGlo
 bool OSLRenderServices::texture(ustring filename, TextureOpt &options,
                                 OSL::ShaderGlobals *sg,
                                 float s, float t, float dsdx, float dtdx,
-                                float dsdy, float dtdy, float *result)
+                                float dsdy, float dtdy, int nchannels, float *result)
 {
 	OSL::TextureSystem *ts = osl_ts;
 	ShaderData *sd = (ShaderData *)(sg->renderstate);
@@ -869,9 +878,9 @@ bool OSLRenderServices::texture(ustring filename, TextureOpt &options,
 		PtexFilter::Options opts(PtexFilter::f_bicubic, mipmaplerp, sharpness);
 		PtexPtr<PtexFilter> f(PtexFilter::getFilter(r, opts));
 
-		f->eval(result, options.firstchannel, options.nchannels, faceid, u, v, dudx, dvdx, dudy, dvdy);
+		f->eval(result, options.firstchannel, nchannels, faceid, u, v, dudx, dvdx, dudy, dvdy);
 
-		for(int c = r->numChannels(); c < options.nchannels; c++)
+		for(int c = r->numChannels(); c < nchannels; c++)
 			result[c] = result[0];
 
 		return true;
@@ -879,16 +888,16 @@ bool OSLRenderServices::texture(ustring filename, TextureOpt &options,
 #endif
 	bool status;
 
-	if(filename[0] == '@' && filename.find('.') == -1) {
-        int slot = atoi(filename.c_str() + 1);
+	if(filename[0] == '@') {
+		int slot = atoi(filename.c_str() + 1);
 		float4 rgba = kernel_tex_image_interp(slot, s, 1.0f - t);
 
 		result[0] = rgba[0];
-		if(options.nchannels > 1)
+		if(nchannels > 1)
 			result[1] = rgba[1];
-		if(options.nchannels > 2)
+		if(nchannels > 2)
 			result[2] = rgba[2];
-		if(options.nchannels > 3)
+		if(nchannels > 3)
 			result[3] = rgba[3];
 		status = true;
 	}
@@ -898,17 +907,24 @@ bool OSLRenderServices::texture(ustring filename, TextureOpt &options,
 
 		OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info);
 
+#if OIIO_VERSION < 10500
 		status = ts->texture(th, thread_info,
-		                     options, s, t, dsdx, dtdx, dsdy, dtdy, result);
+		                     options, s, t, dsdx, dtdx, dsdy, dtdy,
+		                     result);
+#else
+		status = ts->texture(th, thread_info,
+		                     options, s, t, dsdx, dtdx, dsdy, dtdy,
+		                     nchannels, result);
+#endif
 	}
 
 	if(!status) {
-		if(options.nchannels == 3 || options.nchannels == 4) {
+		if(nchannels == 3 || nchannels == 4) {
 			result[0] = 1.0f;
 			result[1] = 0.0f;
 			result[2] = 1.0f;
 
-			if(options.nchannels == 4)
+			if(nchannels == 4)
 				result[3] = 1.0f;
 		}
 	}
@@ -919,26 +935,46 @@ bool OSLRenderServices::texture(ustring filename, TextureOpt &options,
 bool OSLRenderServices::texture3d(ustring filename, TextureOpt &options,
                                   OSL::ShaderGlobals *sg, const OSL::Vec3 &P,
                                   const OSL::Vec3 &dPdx, const OSL::Vec3 &dPdy,
-                                  const OSL::Vec3 &dPdz, float *result)
+                                  const OSL::Vec3 &dPdz, int nchannels, float *result)
 {
 	OSL::TextureSystem *ts = osl_ts;
 	ShaderData *sd = (ShaderData *)(sg->renderstate);
 	KernelGlobals *kg = sd->osl_globals;
-	OSLThreadData *tdata = kg->osl_tdata;
-	OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info;
-
-	OIIO::TextureSystem::TextureHandle *th =  ts->get_texture_handle(filename, thread_info);
+	bool status;
+	if(filename[0] == '@') {
+		int slot = atoi(filename.c_str() + 1);
+		float4 rgba = kernel_tex_image_interp_3d(slot, P.x, P.y, P.z);
 
-	bool status = ts->texture3d(th, thread_info,
-	                            options, P, dPdx, dPdy, dPdz, result);
+		result[0] = rgba[0];
+		if(nchannels > 1)
+			result[1] = rgba[1];
+		if(nchannels > 2)
+			result[2] = rgba[2];
+		if(nchannels > 3)
+			result[3] = rgba[3];
+		status = true;
+	}
+	else {
+		OSLThreadData *tdata = kg->osl_tdata;
+		OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info;
+		OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info);
+#if OIIO_VERSION < 10500
+		status = ts->texture3d(th, thread_info,
+		                       options, P, dPdx, dPdy, dPdz, result);
+#else
+		status = ts->texture3d(th, thread_info,
+		                       options, P, dPdx, dPdy, dPdz,
+		                       nchannels, result);
+#endif
+	}
 
 	if(!status) {
-		if(options.nchannels == 3 || options.nchannels == 4) {
+		if(nchannels == 3 || nchannels == 4) {
 			result[0] = 1.0f;
 			result[1] = 0.0f;
 			result[2] = 1.0f;
 
-			if(options.nchannels == 4)
+			if(nchannels == 4)
 				result[3] = 1.0f;
 		}
 
@@ -949,7 +985,8 @@ bool OSLRenderServices::texture3d(ustring filename, TextureOpt &options,
 
 bool OSLRenderServices::environment(ustring filename, TextureOpt &options,
                                     OSL::ShaderGlobals *sg, const OSL::Vec3 &R,
-                                    const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy, float *result)
+                                    const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy,
+                                    int nchannels, float *result)
 {
 	OSL::TextureSystem *ts = osl_ts;
 	ShaderData *sd = (ShaderData *)(sg->renderstate);
@@ -957,17 +994,24 @@ bool OSLRenderServices::environment(ustring filename, TextureOpt &options,
 	OSLThreadData *tdata = kg->osl_tdata;
 	OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info;
 
-	OIIO::TextureSystem::TextureHandle *th =  ts->get_texture_handle(filename, thread_info);
+	OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info);
+
+#if OIIO_VERSION < 10500
 	bool status = ts->environment(th, thread_info,
 	                              options, R, dRdx, dRdy, result);
+#else
+	bool status = ts->environment(th, thread_info,
+	                              options, R, dRdx, dRdy,
+	                              nchannels, result);
+#endif
 
 	if(!status) {
-		if(options.nchannels == 3 || options.nchannels == 4) {
+		if(nchannels == 3 || nchannels == 4) {
 			result[0] = 1.0f;
 			result[1] = 0.0f;
 			result[2] = 1.0f;
 
-			if(options.nchannels == 4)
+			if(nchannels == 4)
 				result[3] = 1.0f;
 		}
 	}
@@ -1018,7 +1062,7 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg,
 
 	ray.P = TO_FLOAT3(P);
 	ray.D = TO_FLOAT3(R);
-	ray.t = (options.maxdist == 1.0e30)? FLT_MAX: options.maxdist - options.mindist;
+	ray.t = (options.maxdist == 1.0e30f)? FLT_MAX: options.maxdist - options.mindist;
 	ray.time = sd->time;
 
 	if(options.mindist == 0.0f) {
diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h
index 6f928a0d103..cb6f2311ad8 100644
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __OSL_SERVICES_H__
@@ -97,16 +97,64 @@ public:
 	bool texture(ustring filename, TextureOpt &options,
 	             OSL::ShaderGlobals *sg,
 	             float s, float t, float dsdx, float dtdx,
-	             float dsdy, float dtdy, float *result);
+	             float dsdy, float dtdy, int nchannels, float *result);
 
 	bool texture3d(ustring filename, TextureOpt &options,
 	               OSL::ShaderGlobals *sg, const OSL::Vec3 &P,
 	               const OSL::Vec3 &dPdx, const OSL::Vec3 &dPdy,
-	               const OSL::Vec3 &dPdz, float *result);
+	               const OSL::Vec3 &dPdz, int nchannels, float *result);
+
+#if OSL_LIBRARY_VERSION_CODE >= 10700
+	bool texture(ustring filename, /* handle-taking overload; compiled only when OSL_LIBRARY_VERSION_CODE >= 10700 */
+	             TextureHandle * /*texture_handle*/,
+	             TexturePerthread * /*texture_thread_info*/,
+	             TextureOpt &options,
+	             OSL::ShaderGlobals *sg,
+	             float s, float t,
+	             float dsdx, float dtdx, float dsdy, float dtdy,
+	             int nchannels,
+	             float *result,
+	             float * /*dresultds*/,
+	             float * /*dresultdt*/)
+	{ /* the handle, per-thread info and derivative outputs are unnamed and unused: forward to the filename-based overload */
+		return texture(filename,
+		               options,
+		               sg,
+		               s, t,
+		               dsdx, dtdx, dsdy, dtdy,
+		               nchannels,
+		               result);
+	}
+
+	bool texture3d(ustring filename, /* handle-taking overload; compiled only when OSL_LIBRARY_VERSION_CODE >= 10700 */
+	               TextureHandle * /*texture_handle*/,
+	               TexturePerthread * /*texture_thread_info*/,
+	               TextureOpt &options,
+	               OSL::ShaderGlobals *sg,
+	               const OSL::Vec3 &P,
+	               const OSL::Vec3 &dPdx,
+	               const OSL::Vec3 &dPdy,
+	               const OSL::Vec3 &dPdz,
+	               int nchannels,
+	               float *result,
+	               float * /*dresultds*/,
+	               float * /*dresultdt*/,
+	               float * /*dresultdr*/)
+	{ /* the handle, per-thread info and derivative outputs are unnamed and unused: forward to the filename-based overload */
+		return texture3d(filename,
+		                 options,
+		                 sg,
+		                 P,
+		                 dPdx, dPdy, dPdz,
+		                 nchannels,
+		                 result);
+	}
+#endif
 
 	bool environment(ustring filename, TextureOpt &options,
 	                 OSL::ShaderGlobals *sg, const OSL::Vec3 &R,
-	                 const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy, float *result);
+	                 const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy,
+	                 int nchannels, float *result);
 
 	bool get_texture_info(OSL::ShaderGlobals *sg, ustring filename, int subimage,
 	                      ustring dataname, TypeDesc datatype, void *data);
@@ -159,70 +207,37 @@ public:
 	static ustring u_v;
 	static ustring u_empty;
 
-#if OSL_LIBRARY_VERSION_CODE < 10500
-	bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) {
-		return get_matrix(NULL, result, xform, time);
-	}
-
-	bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) {
-		return get_inverse_matrix(NULL, result, xform, time);
-	}
-
-	bool get_matrix(OSL::Matrix44 &result, ustring from, float time) {
-		return get_matrix(NULL, result, from, time);
-	}
-
-	bool get_inverse_matrix(OSL::Matrix44 &result, ustring to, float time) {
-		return get_inverse_matrix(NULL, result, to, time);
-	}
-
-	bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) {
-		return get_matrix(NULL, result, xform);
-	}
-
-	bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) {
-		return get_inverse_matrix(NULL, result, xform);
-	}
-
-	bool get_matrix(OSL::Matrix44 &result, ustring from) {
-		return get_matrix(NULL, result, from);
-	}
-
-	bool get_inverse_matrix(OSL::Matrix44 &result, ustring to) {
-		return get_inverse_matrix(NULL, result, to);
-	}
+	/* Compatibility wrappers for OSL library versions below 10600: same callbacks without nchannels, forwarding options.nchannels. */
 
-	bool get_array_attribute(void *renderstate, bool derivatives,
-	                         ustring object, TypeDesc type, ustring name,
-	                         int index, void *val) {
-		OSL::ShaderGlobals sg;
-		sg.renderstate = renderstate;
-		return get_array_attribute(&sg, derivatives,
-		                           object, type, name,
-		                           index, val);
-	}
-
-	bool get_attribute(void *renderstate, bool derivatives, ustring object_name,
-	                   TypeDesc type, ustring name, void *val) {
-		OSL::ShaderGlobals sg;
-		sg.renderstate = renderstate;
-		return get_attribute(&sg, derivatives, object_name, type, name, val);
+#if OSL_LIBRARY_VERSION_CODE < 10600
+	inline bool texture(ustring filename, TextureOpt &options, /* signature without nchannels; compiled only when OSL_LIBRARY_VERSION_CODE < 10600 */
+	                    OSL::ShaderGlobals *sg,
+	                    float s, float t, float dsdx, float dtdx,
+	                    float dsdy, float dtdy, float *result)
+	{ /* channel count is taken from options.nchannels and passed to the nchannels-taking overload */
+		return texture(filename, options, sg, s, t, dsdx, dtdx, dsdy, dtdy,
+		               options.nchannels, result);
 	}
 
-	bool has_userdata(ustring name, TypeDesc type, void *renderstate) {
-		return has_userdata(name, type, (OSL::ShaderGlobals *) renderstate);
+	inline bool texture3d(ustring filename, TextureOpt &options, /* signature without nchannels; compiled only when OSL_LIBRARY_VERSION_CODE < 10600 */
+	                      OSL::ShaderGlobals *sg, const OSL::Vec3 &P,
+	                      const OSL::Vec3 &dPdx, const OSL::Vec3 &dPdy,
+	                      const OSL::Vec3 &dPdz, float *result)
+	{ /* channel count is taken from options.nchannels and passed to the nchannels-taking overload */
+		return texture3d(filename, options, sg, P, dPdx, dPdy, dPdz,
+		                 options.nchannels, result);
 	}
 
-	bool get_userdata(bool derivatives, ustring name, TypeDesc type,
-	                  void *renderstate, void *val) {
-		return get_userdata(derivatives, name, type, (OSL::ShaderGlobals *) renderstate, val);
-	}
-
-	bool get_texture_info(ustring filename, int subimage,
-	                      ustring dataname, TypeDesc datatype, void *data) {
-		return get_texture_info(NULL, filename, subimage, dataname, datatype, data);
+	inline bool environment(ustring filename, TextureOpt &options, /* signature without nchannels; compiled only when OSL_LIBRARY_VERSION_CODE < 10600 */
+	                        OSL::ShaderGlobals *sg, const OSL::Vec3 &R,
+	                        const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy,
+	                        float *result)
+	{ /* channel count is taken from options.nchannels and passed to the nchannels-taking overload */
+		return environment(filename, options, sg, R, dRdx, dRdy,
+		                   options.nchannels, result);
 	}
 #endif
+
 private:
 	KernelGlobals *kernel_globals;
 	OSL::TextureSystem *osl_ts;
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 48498116874..2f234aa25ea 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -11,9 +11,11 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
+#include <OSL/oslexec.h>
+
 #include "kernel_compat_cpu.h"
 #include "kernel_montecarlo.h"
 #include "kernel_types.h"
@@ -34,7 +36,6 @@
 
 #include "attribute.h"
 
-#include <OSL/oslexec.h>
 
 CCL_NAMESPACE_BEGIN
 
@@ -145,162 +146,175 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
 	/* OSL gives us a closure tree, we flatten it into arrays per
 	 * closure type, for evaluation, sampling, etc later on. */
 
-	if (closure->type == OSL::ClosureColor::COMPONENT) {
-		OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure;
-		CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
+#if OSL_LIBRARY_VERSION_CODE < 10700
+	switch(closure->type) {
+#else
+	switch(closure->id) {
+#endif
+		case OSL::ClosureColor::MUL: {
+			OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
+			flatten_surface_closure_tree(sd, path_flag, mul->closure, TO_FLOAT3(mul->weight) * weight);
+			break;
+		}
+		case OSL::ClosureColor::ADD: {
+			OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
+			flatten_surface_closure_tree(sd, path_flag, add->closureA, weight);
+			flatten_surface_closure_tree(sd, path_flag, add->closureB, weight);
+			break;
+		}
+		default: {
+			OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure;
+			CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
 
-		if (prim) {
-			ShaderClosure sc;
+			if(prim) {
+				ShaderClosure sc;
 
 #ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS
-			weight = weight*TO_FLOAT3(comp->w);
+				weight = weight*TO_FLOAT3(comp->w);
 #endif
-			sc.weight = weight;
+				sc.weight = weight;
 
-			prim->setup();
+				prim->setup();
 
-			switch (prim->category) {
-				case CClosurePrimitive::BSDF: {
-					CBSDFClosure *bsdf = (CBSDFClosure *)prim;
-					int scattering = bsdf->scattering();
+				switch(prim->category) {
+					case CClosurePrimitive::BSDF: {
+						CBSDFClosure *bsdf = (CBSDFClosure *)prim;
+						int scattering = bsdf->scattering();
 
-					/* caustic options */
-					if((scattering & LABEL_GLOSSY) && (path_flag & PATH_RAY_DIFFUSE)) {
-						KernelGlobals *kg = sd->osl_globals;
+						/* caustic options */
+						if((scattering & LABEL_GLOSSY) && (path_flag & PATH_RAY_DIFFUSE)) {
+							KernelGlobals *kg = sd->osl_globals;
 
-						if((!kernel_data.integrator.caustics_reflective && (scattering & LABEL_REFLECT)) ||
-						   (!kernel_data.integrator.caustics_refractive && (scattering & LABEL_TRANSMIT))) {
-							return;
+							if((!kernel_data.integrator.caustics_reflective && (scattering & LABEL_REFLECT)) ||
+							   (!kernel_data.integrator.caustics_refractive && (scattering & LABEL_TRANSMIT)))
+							{
+								return;
+							}
 						}
-					}
 
-					/* sample weight */
-					float sample_weight = fabsf(average(weight));
+						/* sample weight */
+						float sample_weight = fabsf(average(weight));
 
-					sc.sample_weight = sample_weight;
+						sc.sample_weight = sample_weight;
 
-					sc.type = bsdf->sc.type;
-					sc.N = bsdf->sc.N;
-					sc.T = bsdf->sc.T;
-					sc.data0 = bsdf->sc.data0;
-					sc.data1 = bsdf->sc.data1;
-					sc.data2 = bsdf->sc.data2;
-					sc.prim = bsdf->sc.prim;
+						sc.type = bsdf->sc.type;
+						sc.N = bsdf->sc.N;
+						sc.T = bsdf->sc.T;
+						sc.data0 = bsdf->sc.data0;
+						sc.data1 = bsdf->sc.data1;
+						sc.data2 = bsdf->sc.data2;
+						sc.prim = bsdf->sc.prim;
 
-					/* add */
-					if(sc.sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) {
-						sd->closure[sd->num_closure++] = sc;
-						sd->flag |= bsdf->shaderdata_flag();
-					}
-					break;
-				}
-				case CClosurePrimitive::Emissive: {
-					/* sample weight */
-					float sample_weight = fabsf(average(weight));
-
-					sc.sample_weight = sample_weight;
-					sc.type = CLOSURE_EMISSION_ID;
-					sc.data0 = 0.0f;
-					sc.data1 = 0.0f;
-					sc.data2 = 0.0f;
-					sc.prim = NULL;
-
-					/* flag */
-					if(sd->num_closure < MAX_CLOSURE) {
-						sd->closure[sd->num_closure++] = sc;
-						sd->flag |= SD_EMISSION;
-					}
-					break;
-				}
-				case CClosurePrimitive::AmbientOcclusion: {
-					/* sample weight */
-					float sample_weight = fabsf(average(weight));
-
-					sc.sample_weight = sample_weight;
-					sc.type = CLOSURE_AMBIENT_OCCLUSION_ID;
-					sc.data0 = 0.0f;
-					sc.data1 = 0.0f;
-					sc.data2 = 0.0f;
-					sc.prim = NULL;
-
-					if(sd->num_closure < MAX_CLOSURE) {
-						sd->closure[sd->num_closure++] = sc;
-						sd->flag |= SD_AO;
-					}
-					break;
-				}
-				case CClosurePrimitive::Holdout: {
-					sc.sample_weight = 0.0f;
-					sc.type = CLOSURE_HOLDOUT_ID;
-					sc.data0 = 0.0f;
-					sc.data1 = 0.0f;
-					sc.data2 = 0.0f;
-					sc.prim = NULL;
-
-					if(sd->num_closure < MAX_CLOSURE) {
-						sd->closure[sd->num_closure++] = sc;
-						sd->flag |= SD_HOLDOUT;
+						/* add */
+						if(sc.sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) {
+							sd->closure[sd->num_closure++] = sc;
+							sd->flag |= bsdf->shaderdata_flag();
+						}
+						break;
 					}
-					break;
-				}
-				case CClosurePrimitive::BSSRDF: {
-					CBSSRDFClosure *bssrdf = (CBSSRDFClosure *)prim;
-					float sample_weight = fabsf(average(weight));
+					case CClosurePrimitive::Emissive: {
+						/* sample weight */
+						float sample_weight = fabsf(average(weight));
 
-					if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure+2 < MAX_CLOSURE) {
 						sc.sample_weight = sample_weight;
-
-						sc.type = bssrdf->sc.type;
-						sc.N = bssrdf->sc.N;
-						sc.data1 = bssrdf->sc.data1;
-						sc.T.x = bssrdf->sc.T.x;
+						sc.type = CLOSURE_EMISSION_ID;
+						sc.data0 = 0.0f;
+						sc.data1 = 0.0f;
+						sc.data2 = 0.0f;
 						sc.prim = NULL;
 
-						/* disable in case of diffuse ancestor, can't see it well then and
-						 * adds considerably noise due to probabilities of continuing path
-						 * getting lower and lower */
-						if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR)
-							bssrdf->radius = make_float3(0.0f, 0.0f, 0.0f);
-
-						/* create one closure for each color channel */
-						if(fabsf(weight.x) > 0.0f) {
-							sc.weight = make_float3(weight.x, 0.0f, 0.0f);
-							sc.data0 = bssrdf->radius.x;
-							sd->flag |= bssrdf_setup(&sc, sc.type);
+						/* flag */
+						if(sd->num_closure < MAX_CLOSURE) {
 							sd->closure[sd->num_closure++] = sc;
+							sd->flag |= SD_EMISSION;
 						}
+						break;
+					}
+					case CClosurePrimitive::AmbientOcclusion: {
+						/* sample weight */
+						float sample_weight = fabsf(average(weight));
+
+						sc.sample_weight = sample_weight;
+						sc.type = CLOSURE_AMBIENT_OCCLUSION_ID;
+						sc.data0 = 0.0f;
+						sc.data1 = 0.0f;
+						sc.data2 = 0.0f;
+						sc.prim = NULL;
 
-						if(fabsf(weight.y) > 0.0f) {
-							sc.weight = make_float3(0.0f, weight.y, 0.0f);
-							sc.data0 = bssrdf->radius.y;
-							sd->flag |= bssrdf_setup(&sc, sc.type);
+						if(sd->num_closure < MAX_CLOSURE) {
 							sd->closure[sd->num_closure++] = sc;
+							sd->flag |= SD_AO;
 						}
+						break;
+					}
+					case CClosurePrimitive::Holdout: {
+						sc.sample_weight = 0.0f;
+						sc.type = CLOSURE_HOLDOUT_ID;
+						sc.data0 = 0.0f;
+						sc.data1 = 0.0f;
+						sc.data2 = 0.0f;
+						sc.prim = NULL;
 
-						if(fabsf(weight.z) > 0.0f) {
-							sc.weight = make_float3(0.0f, 0.0f, weight.z);
-							sc.data0 = bssrdf->radius.z;
-							sd->flag |= bssrdf_setup(&sc, sc.type);
+						if(sd->num_closure < MAX_CLOSURE) {
 							sd->closure[sd->num_closure++] = sc;
+							sd->flag |= SD_HOLDOUT;
+						}
+						break;
+					}
+					case CClosurePrimitive::BSSRDF: {
+						CBSSRDFClosure *bssrdf = (CBSSRDFClosure *)prim;
+						float sample_weight = fabsf(average(weight));
+
+						if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure+2 < MAX_CLOSURE) {
+							sc.sample_weight = sample_weight;
+
+							sc.type = bssrdf->sc.type;
+							sc.N = bssrdf->sc.N;
+							sc.data1 = bssrdf->sc.data1;
+							sc.T.x = bssrdf->sc.T.x;
+							sc.prim = NULL;
+
+							/* disable in case of diffuse ancestor, can't see it well then and
+							 * adds considerable noise due to probabilities of continuing path
+							 * getting lower and lower */
+							if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR)
+								bssrdf->radius = make_float3(0.0f, 0.0f, 0.0f);
+
+							/* create one closure for each color channel */
+							if(fabsf(weight.x) > 0.0f) {
+								sc.weight = make_float3(weight.x, 0.0f, 0.0f);
+								sc.data0 = bssrdf->radius.x;
+								sc.data1 = 0.0f;
+								sd->flag |= bssrdf_setup(&sc, sc.type);
+								sd->closure[sd->num_closure++] = sc;
+							}
+
+							if(fabsf(weight.y) > 0.0f) {
+								sc.weight = make_float3(0.0f, weight.y, 0.0f);
+								sc.data0 = bssrdf->radius.y;
+								sc.data1 = 0.0f;
+								sd->flag |= bssrdf_setup(&sc, sc.type);
+								sd->closure[sd->num_closure++] = sc;
+							}
+
+							if(fabsf(weight.z) > 0.0f) {
+								sc.weight = make_float3(0.0f, 0.0f, weight.z);
+								sc.data0 = bssrdf->radius.z;
+								sc.data1 = 0.0f;
+								sd->flag |= bssrdf_setup(&sc, sc.type);
+								sd->closure[sd->num_closure++] = sc;
+							}
 						}
+						break;
 					}
-					break;
+					case CClosurePrimitive::Background:
+					case CClosurePrimitive::Volume:
+						break; /* not relevant */
 				}
-				case CClosurePrimitive::Background:
-				case CClosurePrimitive::Volume:
-					break; /* not relevant */
 			}
+			break;
 		}
 	}
-	else if (closure->type == OSL::ClosureColor::MUL) {
-		OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
-		flatten_surface_closure_tree(sd, path_flag, mul->closure, TO_FLOAT3(mul->weight) * weight);
-	}
-	else if (closure->type == OSL::ClosureColor::ADD) {
-		OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
-		flatten_surface_closure_tree(sd, path_flag, add->closureA, weight);
-		flatten_surface_closure_tree(sd, path_flag, add->closureB, weight);
-	}
 }
 
 void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, int path_flag, ShaderContext ctx)
@@ -315,11 +329,11 @@ void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, int path_flag, S
 	OSL::ShadingContext *octx = tdata->context[(int)ctx];
 	int shader = sd->shader & SHADER_MASK;
 
-	if (kg->osl->surface_state[shader])
+	if(kg->osl->surface_state[shader])
 		ss->execute(*octx, *(kg->osl->surface_state[shader]), *globals);
 
 	/* flatten closure tree */
-	if (globals->Ci)
+	if(globals->Ci)
 		flatten_surface_closure_tree(sd, path_flag, globals->Ci);
 }
 
@@ -331,27 +345,33 @@ static float3 flatten_background_closure_tree(const OSL::ClosureColor *closure)
 	 * is only one supported closure type at the moment, which has no evaluation
 	 * functions, so we just sum the weights */
 
-	if (closure->type == OSL::ClosureColor::COMPONENT) {
-		OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure;
-		CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
-
-		if (prim && prim->category == CClosurePrimitive::Background)
-#ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS
-			return TO_FLOAT3(comp->w);
+#if OSL_LIBRARY_VERSION_CODE < 10700
+	switch(closure->type) {
 #else
-			return make_float3(1.0f, 1.0f, 1.0f);
+	switch(closure->id) {
 #endif
-	}
-	else if (closure->type == OSL::ClosureColor::MUL) {
-		OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
+		case OSL::ClosureColor::MUL: {
+			OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
 
-		return TO_FLOAT3(mul->weight) * flatten_background_closure_tree(mul->closure);
-	}
-	else if (closure->type == OSL::ClosureColor::ADD) {
-		OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
+			return TO_FLOAT3(mul->weight) * flatten_background_closure_tree(mul->closure);
+		}
+		case OSL::ClosureColor::ADD: {
+			OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
 
-		return flatten_background_closure_tree(add->closureA) +
-		       flatten_background_closure_tree(add->closureB);
+			return flatten_background_closure_tree(add->closureA) +
+			       flatten_background_closure_tree(add->closureB);
+		}
+		default: {
+			OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure;
+			CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
+
+			if(prim && prim->category == CClosurePrimitive::Background)
+#ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS
+				return TO_FLOAT3(comp->w);
+#else
+				return make_float3(1.0f, 1.0f, 1.0f);
+#endif
+		}
 	}
 
 	return make_float3(0.0f, 0.0f, 0.0f);
@@ -368,11 +388,11 @@ float3 OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, int path_fl
 	OSL::ShaderGlobals *globals = &tdata->globals;
 	OSL::ShadingContext *octx = tdata->context[(int)ctx];
 
-	if (kg->osl->background_state)
+	if(kg->osl->background_state)
 		ss->execute(*octx, *(kg->osl->background_state), *globals);
 
 	/* return background color immediately */
-	if (globals->Ci)
+	if(globals->Ci)
 		return flatten_background_closure_tree(globals->Ci);
 
 	return make_float3(0.0f, 0.0f, 0.0f);
@@ -386,76 +406,84 @@ static void flatten_volume_closure_tree(ShaderData *sd,
 	/* OSL gives us a closure tree, we flatten it into arrays per
 	 * closure type, for evaluation, sampling, etc later on. */
 
-	if (closure->type == OSL::ClosureColor::COMPONENT) {
-		OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure;
-		CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
+#if OSL_LIBRARY_VERSION_CODE < 10700
+	switch(closure->type) {
+#else
+	switch(closure->id) {
+#endif
+		case OSL::ClosureColor::MUL: {
+			OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
+			flatten_volume_closure_tree(sd, mul->closure, TO_FLOAT3(mul->weight) * weight);
+			break;
+		}
+		case OSL::ClosureColor::ADD: {
+			OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
+			flatten_volume_closure_tree(sd, add->closureA, weight);
+			flatten_volume_closure_tree(sd, add->closureB, weight);
+			break;
+		}
+		default: {
+			OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure;
+			CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
 
-		if (prim) {
-			ShaderClosure sc;
+			if(prim) {
+				ShaderClosure sc;
 
 #ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS
-			weight = weight*TO_FLOAT3(comp->w);
+				weight = weight*TO_FLOAT3(comp->w);
 #endif
-			sc.weight = weight;
-
-			prim->setup();
-
-			switch (prim->category) {
-				case CClosurePrimitive::Volume: {
-					CVolumeClosure *volume = (CVolumeClosure *)prim;
-					/* sample weight */
-					float sample_weight = fabsf(average(weight));
-
-					sc.sample_weight = sample_weight;
-					sc.type = volume->sc.type;
-					sc.data0 = volume->sc.data0;
-					sc.data1 = volume->sc.data1;
-
-					/* add */
-					if((sc.sample_weight > CLOSURE_WEIGHT_CUTOFF) &&
-					   (sd->num_closure < MAX_CLOSURE))
-					{
-						sd->closure[sd->num_closure++] = sc;
-						sd->flag |= volume->shaderdata_flag();
+				sc.weight = weight;
+
+				prim->setup();
+
+				switch(prim->category) {
+					case CClosurePrimitive::Volume: {
+						CVolumeClosure *volume = (CVolumeClosure *)prim;
+						/* sample weight */
+						float sample_weight = fabsf(average(weight));
+
+						sc.sample_weight = sample_weight;
+						sc.type = volume->sc.type;
+						sc.data0 = volume->sc.data0;
+						sc.data1 = volume->sc.data1;
+
+						/* add */
+						if((sc.sample_weight > CLOSURE_WEIGHT_CUTOFF) &&
+						   (sd->num_closure < MAX_CLOSURE))
+						{
+							sd->closure[sd->num_closure++] = sc;
+							sd->flag |= volume->shaderdata_flag();
+						}
+						break;
 					}
-					break;
-				}
-				case CClosurePrimitive::Emissive: {
-					/* sample weight */
-					float sample_weight = fabsf(average(weight));
-
-					sc.sample_weight = sample_weight;
-					sc.type = CLOSURE_EMISSION_ID;
-					sc.data0 = 0.0f;
-					sc.data1 = 0.0f;
-					sc.prim = NULL;
-
-					/* flag */
-					if(sd->num_closure < MAX_CLOSURE) {
-						sd->closure[sd->num_closure++] = sc;
-						sd->flag |= SD_EMISSION;
+					case CClosurePrimitive::Emissive: {
+						/* sample weight */
+						float sample_weight = fabsf(average(weight));
+
+						sc.sample_weight = sample_weight;
+						sc.type = CLOSURE_EMISSION_ID;
+						sc.data0 = 0.0f;
+						sc.data1 = 0.0f;
+						sc.prim = NULL;
+
+						/* flag */
+						if(sd->num_closure < MAX_CLOSURE) {
+							sd->closure[sd->num_closure++] = sc;
+							sd->flag |= SD_EMISSION;
+						}
+						break;
 					}
-					break;
+					case CClosurePrimitive::Holdout:
+						break; /* not implemented */
+					case CClosurePrimitive::Background:
+					case CClosurePrimitive::BSDF:
+					case CClosurePrimitive::BSSRDF:
+					case CClosurePrimitive::AmbientOcclusion:
+						break; /* not relevant */
 				}
-				case CClosurePrimitive::Holdout:
-					break; /* not implemented */
-				case CClosurePrimitive::Background:
-				case CClosurePrimitive::BSDF:
-				case CClosurePrimitive::BSSRDF:
-				case CClosurePrimitive::AmbientOcclusion:
-					break; /* not relevant */
 			}
 		}
 	}
-	else if (closure->type == OSL::ClosureColor::MUL) {
-		OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
-		flatten_volume_closure_tree(sd, mul->closure, TO_FLOAT3(mul->weight) * weight);
-	}
-	else if (closure->type == OSL::ClosureColor::ADD) {
-		OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
-		flatten_volume_closure_tree(sd, add->closureA, weight);
-		flatten_volume_closure_tree(sd, add->closureB, weight);
-	}
 }
 
 void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, int path_flag, ShaderContext ctx)
@@ -470,11 +498,11 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, int path_flag, Sh
 	OSL::ShadingContext *octx = tdata->context[(int)ctx];
 	int shader = sd->shader & SHADER_MASK;
 
-	if (kg->osl->volume_state[shader])
+	if(kg->osl->volume_state[shader])
 		ss->execute(*octx, *(kg->osl->volume_state[shader]), *globals);
 	
 	/* flatten closure tree */
-	if (globals->Ci)
+	if(globals->Ci)
 		flatten_volume_closure_tree(sd, globals->Ci);
 }
 
@@ -492,7 +520,7 @@ void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderConte
 	OSL::ShadingContext *octx = tdata->context[(int)ctx];
 	int shader = sd->shader & SHADER_MASK;
 
-	if (kg->osl->displacement_state[shader])
+	if(kg->osl->displacement_state[shader])
 		ss->execute(*octx, *(kg->osl->displacement_state[shader]), *globals);
 
 	/* get back position */
@@ -519,7 +547,7 @@ float3 OSLShader::bsdf_eval(const ShaderData *sd, const ShaderClosure *sc, const
 	CBSDFClosure *bsdf = (CBSDFClosure *)sc->prim;
 	float3 bsdf_eval;
 
-	if (dot(sd->Ng, omega_in) >= 0.0f)
+	if(dot(sd->Ng, omega_in) >= 0.0f)
 		bsdf_eval = bsdf->eval_reflect(sd->I, omega_in, pdf);
 	else
 		bsdf_eval = bsdf->eval_transmit(sd->I, omega_in, pdf);
@@ -547,7 +575,7 @@ int OSLShader::find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id,
 	ustring stdname(std::string("geom:") + std::string(Attribute::standard_name((AttributeStandard)id)));
 	OSLGlobals::AttributeMap::const_iterator it = attr_map.find(stdname);
 
-	if (it != attr_map.end()) {
+	if(it != attr_map.end()) {
 		const OSLGlobals::Attribute &osl_attr = it->second;
 		*elem = osl_attr.elem;
 
diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h
index 40f50d8b0e9..15dd74f9d38 100644
--- a/intern/cycles/kernel/osl/osl_shader.h
+++ b/intern/cycles/kernel/osl/osl_shader.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __OSL_SHADER_H__
diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt
index 0b735ede701..81931463cad 100644
--- a/intern/cycles/kernel/shaders/CMakeLists.txt
+++ b/intern/cycles/kernel/shaders/CMakeLists.txt
@@ -74,6 +74,7 @@ set(SRC_OSL
 	node_vector_transform.osl
 	node_velvet_bsdf.osl
 	node_voronoi_texture.osl
+	node_voxel_texture.osl
 	node_wavelength.osl
 	node_blackbody.osl
 	node_wave_texture.osl
diff --git a/intern/cycles/kernel/shaders/node_absorption_volume.osl b/intern/cycles/kernel/shaders/node_absorption_volume.osl
index 6bac83ba4f5..18f662ebbbd 100644
--- a/intern/cycles/kernel/shaders/node_absorption_volume.osl
+++ b/intern/cycles/kernel/shaders/node_absorption_volume.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_add_closure.osl b/intern/cycles/kernel/shaders/node_add_closure.osl
index b826fb22784..b6596e0b6bd 100644
--- a/intern/cycles/kernel/shaders/node_add_closure.osl
+++ b/intern/cycles/kernel/shaders/node_add_closure.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_ambient_occlusion.osl b/intern/cycles/kernel/shaders/node_ambient_occlusion.osl
index 961aed1016b..5f056122bbe 100644
--- a/intern/cycles/kernel/shaders/node_ambient_occlusion.osl
+++ b/intern/cycles/kernel/shaders/node_ambient_occlusion.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl
index da1e4f77107..281ed4e8726 100644
--- a/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_attribute.osl b/intern/cycles/kernel/shaders/node_attribute.osl
index 43f69fab053..67183e9ffe0 100644
--- a/intern/cycles/kernel/shaders/node_attribute.osl
+++ b/intern/cycles/kernel/shaders/node_attribute.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_background.osl b/intern/cycles/kernel/shaders/node_background.osl
index c4379a8f71b..613d4e360fa 100644
--- a/intern/cycles/kernel/shaders/node_background.osl
+++ b/intern/cycles/kernel/shaders/node_background.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_blackbody.osl b/intern/cycles/kernel/shaders/node_blackbody.osl
index d26e56ab06d..1da6894d0f0 100644
--- a/intern/cycles/kernel/shaders/node_blackbody.osl
+++ b/intern/cycles/kernel/shaders/node_blackbody.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_brick_texture.osl b/intern/cycles/kernel/shaders/node_brick_texture.osl
index 70a6a6ea7ce..35e01178ba8 100644
--- a/intern/cycles/kernel/shaders/node_brick_texture.osl
+++ b/intern/cycles/kernel/shaders/node_brick_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -22,6 +22,7 @@
 float brick_noise(int n) /* fast integer noise */
 {
 	int nn;
+	n = (n + 1013) & 2147483647;
 	n = (n >> 13) ^ n;
 	nn = (n * (n * n * 60493 + 19990303) + 1376312589) & 2147483647;
 	return 0.5 * ((float)nn / 1073741824.0);
@@ -87,12 +88,9 @@ shader node_brick_texture(
 		
 	if (Fac != 1.0) {
 		float facm = 1.0 - tint;
-
-		Col[0] = facm * (Color1[0]) + tint * Color2[0];
-		Col[1] = facm * (Color1[1]) + tint * Color2[1];
-		Col[2] = facm * (Color1[2]) + tint * Color2[2];
+		Col = facm * Color1 + tint * Color2;
 	}
 	
-	Color = (Fac == 1.0) ? Mortar: Col;
+	Color = (Fac == 1.0) ? Mortar : Col;
 }
 
diff --git a/intern/cycles/kernel/shaders/node_brightness.osl b/intern/cycles/kernel/shaders/node_brightness.osl
index 468b0f052c3..00cfb167885 100644
--- a/intern/cycles/kernel/shaders/node_brightness.osl
+++ b/intern/cycles/kernel/shaders/node_brightness.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_bump.osl b/intern/cycles/kernel/shaders/node_bump.osl
index bbc08760cd5..9882857f2ec 100644
--- a/intern/cycles/kernel/shaders/node_bump.osl
+++ b/intern/cycles/kernel/shaders/node_bump.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_camera.osl b/intern/cycles/kernel/shaders/node_camera.osl
index 20ebb7dc095..5e90cb8b8ee 100644
--- a/intern/cycles/kernel/shaders/node_camera.osl
+++ b/intern/cycles/kernel/shaders/node_camera.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_checker_texture.osl b/intern/cycles/kernel/shaders/node_checker_texture.osl
index a6d21fd36f3..ae84c71dd42 100644
--- a/intern/cycles/kernel/shaders/node_checker_texture.osl
+++ b/intern/cycles/kernel/shaders/node_checker_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_color.h b/intern/cycles/kernel/shaders/node_color.h
index 095e628f20c..4a17286a07f 100644
--- a/intern/cycles/kernel/shaders/node_color.h
+++ b/intern/cycles/kernel/shaders/node_color.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 float color_srgb_to_scene_linear(float c)
diff --git a/intern/cycles/kernel/shaders/node_combine_hsv.osl b/intern/cycles/kernel/shaders/node_combine_hsv.osl
index 010773acc5c..6b922bf4e6b 100644
--- a/intern/cycles/kernel/shaders/node_combine_hsv.osl
+++ b/intern/cycles/kernel/shaders/node_combine_hsv.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_combine_rgb.osl b/intern/cycles/kernel/shaders/node_combine_rgb.osl
index 8466a89b536..f343fdefd84 100644
--- a/intern/cycles/kernel/shaders/node_combine_rgb.osl
+++ b/intern/cycles/kernel/shaders/node_combine_rgb.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_combine_xyz.osl b/intern/cycles/kernel/shaders/node_combine_xyz.osl
index 933dee5bd78..86182056b09 100644
--- a/intern/cycles/kernel/shaders/node_combine_xyz.osl
+++ b/intern/cycles/kernel/shaders/node_combine_xyz.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_convert_from_color.osl b/intern/cycles/kernel/shaders/node_convert_from_color.osl
index 2f4503e66e3..44074317f42 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_color.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_color.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_convert_from_float.osl b/intern/cycles/kernel/shaders/node_convert_from_float.osl
index f5b91903078..fc5c79c4c64 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_float.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_float.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_convert_from_int.osl b/intern/cycles/kernel/shaders/node_convert_from_int.osl
index 110922a5df1..3c3785ebc0d 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_int.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_int.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_convert_from_normal.osl b/intern/cycles/kernel/shaders/node_convert_from_normal.osl
index 995c86d8828..8ecc56ac8ce 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_normal.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_normal.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_convert_from_point.osl b/intern/cycles/kernel/shaders/node_convert_from_point.osl
index 2ed151273a8..e5913b7a1e4 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_point.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_point.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_convert_from_string.osl b/intern/cycles/kernel/shaders/node_convert_from_string.osl
index 50cce252be4..0466734277b 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_string.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_string.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_convert_from_vector.osl b/intern/cycles/kernel/shaders/node_convert_from_vector.osl
index 035c46625a0..79c5cb04550 100644
--- a/intern/cycles/kernel/shaders/node_convert_from_vector.osl
+++ b/intern/cycles/kernel/shaders/node_convert_from_vector.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_diffuse_bsdf.osl b/intern/cycles/kernel/shaders/node_diffuse_bsdf.osl
index e8c94660e4f..2bef2d65baa 100644
--- a/intern/cycles/kernel/shaders/node_diffuse_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_diffuse_bsdf.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_emission.osl b/intern/cycles/kernel/shaders/node_emission.osl
index b28d731c19f..c36e2a4c0f3 100644
--- a/intern/cycles/kernel/shaders/node_emission.osl
+++ b/intern/cycles/kernel/shaders/node_emission.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_environment_texture.osl b/intern/cycles/kernel/shaders/node_environment_texture.osl
index 136ccdf8b18..14f0226a0e5 100644
--- a/intern/cycles/kernel/shaders/node_environment_texture.osl
+++ b/intern/cycles/kernel/shaders/node_environment_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_fresnel.h b/intern/cycles/kernel/shaders/node_fresnel.h
index 9f10ba8023e..de2d40a849c 100644
--- a/intern/cycles/kernel/shaders/node_fresnel.h
+++ b/intern/cycles/kernel/shaders/node_fresnel.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
  
 float fresnel_dielectric_cos(float cosi, float eta)
@@ -36,14 +36,14 @@ float fresnel_dielectric_cos(float cosi, float eta)
 
 color fresnel_conductor(float cosi, color eta, color k)
 {
-	color cosi2 = color(cosi*cosi);
+	color cosi2 = color(cosi * cosi);
 	color one = color(1, 1, 1);
 	color tmp_f = eta * eta + k * k;
 	color tmp = tmp_f * cosi2;
 	color Rparl2 = (tmp - (2.0 * eta * cosi) + one) /
-					(tmp + (2.0 * eta * cosi) + one);
+	               (tmp + (2.0 * eta * cosi) + one);
 	color Rperp2 = (tmp_f - (2.0 * eta * cosi) + cosi2) /
-					(tmp_f + (2.0 * eta * cosi) + cosi2);
+	               (tmp_f + (2.0 * eta * cosi) + cosi2);
 	return (Rparl2 + Rperp2) * 0.5;
 }
 
diff --git a/intern/cycles/kernel/shaders/node_fresnel.osl b/intern/cycles/kernel/shaders/node_fresnel.osl
index 7ef553c0f39..8bec7b432f5 100644
--- a/intern/cycles/kernel/shaders/node_fresnel.osl
+++ b/intern/cycles/kernel/shaders/node_fresnel.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_gamma.osl b/intern/cycles/kernel/shaders/node_gamma.osl
index a2ad3f766fe..bc4c1b34266 100644
--- a/intern/cycles/kernel/shaders/node_gamma.osl
+++ b/intern/cycles/kernel/shaders/node_gamma.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_geometry.osl b/intern/cycles/kernel/shaders/node_geometry.osl
index 7bef2051865..b0bd7692489 100644
--- a/intern/cycles/kernel/shaders/node_geometry.osl
+++ b/intern/cycles/kernel/shaders/node_geometry.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -26,7 +26,8 @@ shader node_geometry(
 	output normal TrueNormal = normal(0.0, 0.0, 0.0),
 	output vector Incoming = vector(0.0, 0.0, 0.0),
 	output point Parametric = point(0.0, 0.0, 0.0),
-	output float Backfacing = 0.0)
+	output float Backfacing = 0.0,
+	output float Pointiness = 0.0)
 {
 	Position = P;
 	Normal = NormalIn;
@@ -49,7 +50,7 @@ shader node_geometry(
 
 	/* try to create spherical tangent from generated coordinates */
 	if (getattribute("geom:generated", generated)) {
-		normal data = normal(-(generated[1]-0.5), (generated[0]-0.5), 0.0);
+		normal data = normal(-(generated[1] - 0.5), (generated[0] - 0.5), 0.0);
 		vector T = transform("object", "world", data);
 		Tangent = cross(Normal, normalize(cross(T, Normal)));
 	}
@@ -57,5 +58,13 @@ shader node_geometry(
 		/* otherwise use surface derivatives */
 		Tangent = normalize(dPdu);
 	}
+
+	getattribute("geom:pointiness", Pointiness);
+	if (bump_offset == "dx") {
+		Pointiness += Dx(Pointiness);
+	}
+	else if (bump_offset == "dy") {
+		Pointiness += Dy(Pointiness);
+	}
 }
 
diff --git a/intern/cycles/kernel/shaders/node_glass_bsdf.osl b/intern/cycles/kernel/shaders/node_glass_bsdf.osl
index b3d6133553b..68bc107cc5e 100644
--- a/intern/cycles/kernel/shaders/node_glass_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_glass_bsdf.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
index 5c727ca6917..d3250b32d0b 100644
--- a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_gradient_texture.osl b/intern/cycles/kernel/shaders/node_gradient_texture.osl
index 5aa05917dc2..52b49688ab3 100644
--- a/intern/cycles/kernel/shaders/node_gradient_texture.osl
+++ b/intern/cycles/kernel/shaders/node_gradient_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_hair_info.osl b/intern/cycles/kernel/shaders/node_hair_info.osl
index 1d1ba1983e1..965d2a3c7f7 100644
--- a/intern/cycles/kernel/shaders/node_hair_info.osl
+++ b/intern/cycles/kernel/shaders/node_hair_info.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_holdout.osl b/intern/cycles/kernel/shaders/node_holdout.osl
index cafad1b5757..78a9f46fd15 100644
--- a/intern/cycles/kernel/shaders/node_holdout.osl
+++ b/intern/cycles/kernel/shaders/node_holdout.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_hsv.osl b/intern/cycles/kernel/shaders/node_hsv.osl
index 4722bde4cd7..8d9e50fed6b 100644
--- a/intern/cycles/kernel/shaders/node_hsv.osl
+++ b/intern/cycles/kernel/shaders/node_hsv.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -35,6 +35,11 @@ shader node_hsv(
 
 	Color = hsv_to_rgb(Color);
 
+	// Clamp color to prevent negative values caused by oversaturation.
+	Color[0] = max(Color[0], 0.0);
+	Color[1] = max(Color[1], 0.0);
+	Color[2] = max(Color[2], 0.0);
+
 	ColorOut = mix(ColorIn, Color, Fac);
 }
 
diff --git a/intern/cycles/kernel/shaders/node_image_texture.osl b/intern/cycles/kernel/shaders/node_image_texture.osl
index 7238a1e8862..d3a347b70db 100644
--- a/intern/cycles/kernel/shaders/node_image_texture.osl
+++ b/intern/cycles/kernel/shaders/node_image_texture.osl
@@ -11,15 +11,60 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
 #include "node_color.h"
 
-color image_texture_lookup(string filename, string color_space, float u, float v, output float Alpha, int use_alpha, int is_float, string interpolation)
+point texco_remap_square(point co)
 {
-	color rgb = (color)texture(filename, u, 1.0 - v, "wrap", "periodic", "interp", interpolation, "alpha", Alpha);
+	return (co - point(0.5, 0.5, 0.5)) * 2.0;
+}
+
+point map_to_tube(vector dir)  /* Tube projection: builds point(u, v, 0.0) from the components of dir. */
+{
+	float u, v;
+	v = (dir[2] + 1.0) * 0.5;  /* v depends only on the z component: z = -1 gives 0, z = 1 gives 1. */
+	float len = sqrt(dir[0] * dir[0] + dir[1] * dir[1]);  /* Length of the (x, y) part of dir. */
+	if (len > 0.0) {
+		u = (1.0 - (atan2(dir[0] / len, dir[1] / len) / M_PI)) * 0.5;  /* u from the angle of the normalized (x, y) pair. */
+	}
+	else {
+		v = u = 0.0; /* No (x, y) part: zero both (overriding the v set above) to avoid un-initialized variables. */
+	}
+	return point(u, v, 0.0);
+}
+
+point map_to_sphere(vector dir)  /* Sphere projection: builds point(u, v, 0.0) from the direction of dir. */
+{
+	float len = length(dir);
+	float v, u;
+	if (len > 0.0) {
+		if (dir[0] == 0.0 && dir[1] == 0.0) {
+			u = 0.0;  /* Otherwise atan2(0, 0) below would be a domain error. */
+		}
+		else {
+			u = (1.0 - atan2(dir[0], dir[1]) / M_PI) / 2.0;  /* u from the angle of (x, y). */
+		}
+		v = 1.0 - acos(dir[2] / len) / M_PI;  /* v from the angle between dir and the z axis. */
+	}
+	else {
+		v = u = 0.0;  /* Zero-length dir: set both to avoid un-initialized variables. */
+	}
+	return point(u, v, 0.0);
+}
+
+color image_texture_lookup(string filename,
+                           string color_space,
+                           float u, float v,
+                           output float Alpha,
+                           int use_alpha,
+                           int is_float,
+                           string interpolation,
+                           string wrap)
+{
+	color rgb = (color)texture(filename, u, 1.0 - v, "wrap", wrap, "interp", interpolation, "alpha", Alpha);
 
 	if (use_alpha) {
 		rgb = color_unpremultiply(rgb, Alpha);
@@ -43,6 +88,7 @@ shader node_image_texture(
 	string color_space = "sRGB",
 	string projection = "Flat",
 	string interpolation = "smartcubic",
+	string wrap = "periodic",
 	float projection_blend = 0.0,
 	int is_float = 1,
 	int use_alpha = 1,
@@ -55,7 +101,14 @@ shader node_image_texture(
 		p = transform(mapping, p);
 	
 	if (projection == "Flat") {
-		Color = image_texture_lookup(filename, color_space, p[0], p[1], Alpha, use_alpha, is_float, interpolation);
+		Color = image_texture_lookup(filename,
+		                             color_space,
+		                             p[0], p[1],
+		                             Alpha,
+		                             use_alpha,
+		                             is_float,
+		                             interpolation,
+		                             wrap);
 	}
 	else if (projection == "Box") {
 		/* object space normal */
@@ -113,6 +166,10 @@ shader node_image_texture(
 				weight[2] = ((2.0 - limit) * Nob[2] + (limit - 1.0)) / (2.0 * limit - 1.0);
 			}
 		}
+		else {
+			/* Desperate mode, no valid choice anyway, fall back to one side. */
+			weight[0] = 1.0;
+		}
 
 		Color = color(0.0, 0.0, 0.0);
 		Alpha = 0.0;
@@ -120,17 +177,59 @@ shader node_image_texture(
 		float tmp_alpha;
 
 		if (weight[0] > 0.0) {
-			Color += weight[0] * image_texture_lookup(filename, color_space, p[1], p[2], tmp_alpha, use_alpha, is_float, interpolation);
+			Color += weight[0] * image_texture_lookup(filename,
+			                                          color_space,
+			                                          p[1], p[2],
+			                                          tmp_alpha,
+			                                          use_alpha,
+			                                          is_float,
+			                                          interpolation,
+			                                          wrap);
 			Alpha += weight[0] * tmp_alpha;
 		}
 		if (weight[1] > 0.0) {
-			Color += weight[1] * image_texture_lookup(filename, color_space, p[0], p[2], tmp_alpha, use_alpha, is_float, interpolation);
+			Color += weight[1] * image_texture_lookup(filename,
+			                                          color_space,
+			                                          p[0], p[2],
+			                                          tmp_alpha,
+			                                          use_alpha,
+			                                          is_float,
+			                                          interpolation,
+			                                          wrap);
 			Alpha += weight[1] * tmp_alpha;
 		}
 		if (weight[2] > 0.0) {
-			Color += weight[2] * image_texture_lookup(filename, color_space, p[1], p[0], tmp_alpha, use_alpha, is_float, interpolation);
+			Color += weight[2] * image_texture_lookup(filename,
+			                                          color_space,
+			                                          p[1], p[0],
+			                                          tmp_alpha,
+			                                          use_alpha,
+			                                          is_float,
+			                                          interpolation,
+			                                          wrap);
 			Alpha += weight[2] * tmp_alpha;
 		}
 	}
+	else if (projection == "Sphere") {
+		point projected = map_to_sphere(texco_remap_square(p));
+		Color = image_texture_lookup(filename,
+		                             color_space,
+		                             projected[0], projected[1],
+		                             Alpha,
+		                             use_alpha,
+		                             is_float,
+		                             interpolation,
+		                             wrap);
+	}
+	else if (projection == "Tube") {
+		point projected = map_to_tube(texco_remap_square(p));
+		Color = image_texture_lookup(filename,
+		                             color_space,
+		                             projected[0], projected[1],
+		                             Alpha,
+		                             use_alpha,
+		                             is_float,
+		                             interpolation,
+		                             wrap);
+	}
 }
-
diff --git a/intern/cycles/kernel/shaders/node_invert.osl b/intern/cycles/kernel/shaders/node_invert.osl
index 81ef2d0dc3d..b33b0a43d63 100644
--- a/intern/cycles/kernel/shaders/node_invert.osl
+++ b/intern/cycles/kernel/shaders/node_invert.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_layer_weight.osl b/intern/cycles/kernel/shaders/node_layer_weight.osl
index d03ebe2239a..f583df25773 100644
--- a/intern/cycles/kernel/shaders/node_layer_weight.osl
+++ b/intern/cycles/kernel/shaders/node_layer_weight.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_light_falloff.osl b/intern/cycles/kernel/shaders/node_light_falloff.osl
index 311b87f3764..a594e33d643 100644
--- a/intern/cycles/kernel/shaders/node_light_falloff.osl
+++ b/intern/cycles/kernel/shaders/node_light_falloff.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_light_path.osl b/intern/cycles/kernel/shaders/node_light_path.osl
index 95fbcabf917..99a92c4f403 100644
--- a/intern/cycles/kernel/shaders/node_light_path.osl
+++ b/intern/cycles/kernel/shaders/node_light_path.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_magic_texture.osl b/intern/cycles/kernel/shaders/node_magic_texture.osl
index b8afc6e29ac..c09523f205b 100644
--- a/intern/cycles/kernel/shaders/node_magic_texture.osl
+++ b/intern/cycles/kernel/shaders/node_magic_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_mapping.osl b/intern/cycles/kernel/shaders/node_mapping.osl
index 46ff9f05e07..69106957ee4 100644
--- a/intern/cycles/kernel/shaders/node_mapping.osl
+++ b/intern/cycles/kernel/shaders/node_mapping.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_math.osl b/intern/cycles/kernel/shaders/node_math.osl
index abb6a359e75..7eef97fd7e8 100644
--- a/intern/cycles/kernel/shaders/node_math.osl
+++ b/intern/cycles/kernel/shaders/node_math.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -93,8 +93,8 @@ shader node_math(
 		Value = Value1 > Value2;
 	else if (type == "Modulo")
 		Value = safe_modulo(Value1, Value2);
-    else if (type == "Absolute")
-        Value = fabs(Value1);
+	else if (type == "Absolute")
+		Value = fabs(Value1);
 
 	if (Clamp)
 		Value = clamp(Value, 0.0, 1.0);
diff --git a/intern/cycles/kernel/shaders/node_mix.osl b/intern/cycles/kernel/shaders/node_mix.osl
index dd54fd814de..9ef58e4cbba 100644
--- a/intern/cycles/kernel/shaders/node_mix.osl
+++ b/intern/cycles/kernel/shaders/node_mix.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_mix_closure.osl b/intern/cycles/kernel/shaders/node_mix_closure.osl
index 79d71c97371..5946dfdaaba 100644
--- a/intern/cycles/kernel/shaders/node_mix_closure.osl
+++ b/intern/cycles/kernel/shaders/node_mix_closure.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_musgrave_texture.osl b/intern/cycles/kernel/shaders/node_musgrave_texture.osl
index 60762539002..4f95dec910a 100644
--- a/intern/cycles/kernel/shaders/node_musgrave_texture.osl
+++ b/intern/cycles/kernel/shaders/node_musgrave_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -26,7 +26,7 @@
  * from "Texturing and Modelling: A procedural approach"
  */
 
-float noise_musgrave_fBm(point p, string basis, float H, float lacunarity, float octaves)
+float noise_musgrave_fBm(point p, float H, float lacunarity, float octaves)
 {
 	float rmd;
 	float value = 0.0;
@@ -35,14 +35,14 @@ float noise_musgrave_fBm(point p, string basis, float H, float lacunarity, float
 	int i;
 
 	for (i = 0; i < (int)octaves; i++) {
-		value += safe_noise(p, 0) * pwr;
+		value += safe_noise(p, "signed") * pwr;
 		pwr *= pwHL;
 		p *= lacunarity;
 	}
 
 	rmd = octaves - floor(octaves);
 	if (rmd != 0.0)
-		value += rmd * safe_noise(p, 0) * pwr;
+		value += rmd * safe_noise(p, "signed") * pwr;
 
 	return value;
 }
@@ -54,7 +54,7 @@ float noise_musgrave_fBm(point p, string basis, float H, float lacunarity, float
  * octaves: number of frequencies in the fBm
  */
 
-float noise_musgrave_multi_fractal(point p, string basis, float H, float lacunarity, float octaves)
+float noise_musgrave_multi_fractal(point p, float H, float lacunarity, float octaves)
 {
 	float rmd;
 	float value = 1.0;
@@ -63,14 +63,14 @@ float noise_musgrave_multi_fractal(point p, string basis, float H, float lacunar
 	int i;
 
 	for (i = 0; i < (int)octaves; i++) {
-		value *= (pwr * safe_noise(p, 0) + 1.0);
+		value *= (pwr * safe_noise(p, "signed") + 1.0);
 		pwr *= pwHL;
 		p *= lacunarity;
 	}
 
 	rmd = octaves - floor(octaves);
 	if (rmd != 0.0)
-		value *= (rmd * pwr * safe_noise(p, 0) + 1.0); /* correct? */
+		value *= (rmd * pwr * safe_noise(p, "signed") + 1.0); /* correct? */
 
 	return value;
 }
@@ -83,7 +83,7 @@ float noise_musgrave_multi_fractal(point p, string basis, float H, float lacunar
  * offset: raises the terrain from `sea level'
  */
 
-float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacunarity, float octaves, float offset)
+float noise_musgrave_hetero_terrain(point p, float H, float lacunarity, float octaves, float offset)
 {
 	float value, increment, rmd;
 	float pwHL = pow(lacunarity, -H);
@@ -91,11 +91,11 @@ float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacuna
 	int i;
 
 	/* first unscaled octave of function; later octaves are scaled */
-	value = offset + safe_noise(p, 0);
+	value = offset + safe_noise(p, "signed");
 	p *= lacunarity;
 
 	for (i = 1; i < (int)octaves; i++) {
-		increment = (safe_noise(p, 0) + offset) * pwr * value;
+		increment = (safe_noise(p, "signed") + offset) * pwr * value;
 		value += increment;
 		pwr *= pwHL;
 		p *= lacunarity;
@@ -103,7 +103,7 @@ float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacuna
 
 	rmd = octaves - floor(octaves);
 	if (rmd != 0.0) {
-		increment = (safe_noise(p, 0) + offset) * pwr * value;
+		increment = (safe_noise(p, "signed") + offset) * pwr * value;
 		value += rmd * increment;
 	}
 
@@ -118,15 +118,15 @@ float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacuna
  * offset: raises the terrain from `sea level'
  */
 
-float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H,
-                                          float lacunarity, float octaves, float offset, float gain)
+float noise_musgrave_hybrid_multi_fractal(point p, float H, float lacunarity,
+                                          float octaves, float offset, float gain)
 {
 	float result, signal, weight, rmd;
 	float pwHL = pow(lacunarity, -H);
 	float pwr = pwHL;
 	int i;
 
-	result = safe_noise(p, 0) + offset;
+	result = safe_noise(p, "signed") + offset;
 	weight = gain * result;
 	p *= lacunarity;
 
@@ -134,7 +134,7 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H,
 		if (weight > 1.0)
 			weight = 1.0;
 
-		signal = (safe_noise(p, 0) + offset) * pwr;
+		signal = (safe_noise(p, "signed") + offset) * pwr;
 		pwr *= pwHL;
 		result += weight * signal;
 		weight *= gain * signal;
@@ -143,7 +143,7 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H,
 
 	rmd = octaves - floor(octaves);
 	if (rmd != 0.0)
-		result += rmd * ((safe_noise(p, 0) + offset) * pwr);
+		result += rmd * ((safe_noise(p, "signed") + offset) * pwr);
 
 	return result;
 }
@@ -156,15 +156,15 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H,
  * offset: raises the terrain from `sea level'
  */
 
-float noise_musgrave_ridged_multi_fractal(point p, string basis, float H,
-                                          float lacunarity, float octaves, float offset, float gain)
+float noise_musgrave_ridged_multi_fractal(point p, float H, float lacunarity,
+                                          float octaves, float offset, float gain)
 {
 	float result, signal, weight;
 	float pwHL = pow(lacunarity, -H);
 	float pwr = pwHL;
 	int i;
 
-	signal = offset - fabs(safe_noise(p, 0));
+	signal = offset - fabs(safe_noise(p, "signed"));
 	signal *= signal;
 	result = signal;
 	weight = 1.0;
@@ -172,7 +172,7 @@ float noise_musgrave_ridged_multi_fractal(point p, string basis, float H,
 	for (i = 1; i < (int)octaves; i++) {
 		p *= lacunarity;
 		weight = clamp(signal * gain, 0.0, 1.0);
-		signal = offset - fabs(safe_noise(p, 0));
+		signal = offset - fabs(safe_noise(p, "signed"));
 		signal *= signal;
 		signal *= weight;
 		result += signal * pwr;
@@ -201,7 +201,6 @@ shader node_musgrave_texture(
 	float dimension = max(Dimension, 1e-5);
 	float octaves = clamp(Detail, 0.0, 16.0);
 	float lacunarity = max(Lacunarity, 1e-5);
-	string Basis = "Perlin";
 	float intensity = 1.0;
 
 	point p = Vector;
@@ -212,15 +211,15 @@ shader node_musgrave_texture(
 	p = p * Scale;
 
 	if (Type == "Multifractal")
-		Fac = intensity * noise_musgrave_multi_fractal(p, Basis, dimension, lacunarity, octaves);
+		Fac = intensity * noise_musgrave_multi_fractal(p, dimension, lacunarity, octaves);
 	else if (Type == "fBM")
-		Fac = intensity * noise_musgrave_fBm(p, Basis, dimension, lacunarity, octaves);
+		Fac = intensity * noise_musgrave_fBm(p, dimension, lacunarity, octaves);
 	else if (Type == "Hybrid Multifractal")
-		Fac = intensity * noise_musgrave_hybrid_multi_fractal(p, Basis, dimension, lacunarity, octaves, Offset, Gain);
+		Fac = intensity * noise_musgrave_hybrid_multi_fractal(p, dimension, lacunarity, octaves, Offset, Gain);
 	else if (Type == "Ridged Multifractal")
-		Fac = intensity * noise_musgrave_ridged_multi_fractal(p, Basis, dimension, lacunarity, octaves, Offset, Gain);
+		Fac = intensity * noise_musgrave_ridged_multi_fractal(p, dimension, lacunarity, octaves, Offset, Gain);
 	else if (Type == "Hetero Terrain")
-		Fac = intensity * noise_musgrave_hetero_terrain(p, Basis, dimension, lacunarity, octaves, Offset);
+		Fac = intensity * noise_musgrave_hetero_terrain(p, dimension, lacunarity, octaves, Offset);
 	
 	Color = color(Fac, Fac, Fac);
 }
diff --git a/intern/cycles/kernel/shaders/node_noise_texture.osl b/intern/cycles/kernel/shaders/node_noise_texture.osl
index 912795966e0..e83e5b5b211 100644
--- a/intern/cycles/kernel/shaders/node_noise_texture.osl
+++ b/intern/cycles/kernel/shaders/node_noise_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -19,23 +19,23 @@
 
 /* Noise */
 
-float noise(point p, string basis, float distortion, float detail, float fac, color Color)
+float noise(point p, float distortion, float detail, float fac, color Color)
 {
 	point r;
 	int hard = 0;
 
 	if (distortion != 0.0) {
-		r[0] = noise_basis(p + point(13.5), basis) * distortion;
-		r[1] = noise_basis(p, basis) * distortion;
-		r[2] = noise_basis(p - point(13.5), basis) * distortion;
+		r[0] = safe_noise(p + point(13.5), "unsigned") * distortion;
+		r[1] = safe_noise(p, "unsigned") * distortion;
+		r[2] = safe_noise(p - point(13.5), "unsigned") * distortion;
 		
 		p += r;
 	}
 
-	fac = noise_turbulence(p, basis, detail, hard);
+	fac = noise_turbulence(p, detail, hard);
 	
-	Color = color(fac, noise_turbulence(point(p[1], p[0], p[2]), basis, detail, hard),
-		noise_turbulence(point(p[1], p[2], p[0]), basis, detail, hard));
+	Color = color(fac, noise_turbulence(point(p[1], p[0], p[2]), detail, hard),
+		noise_turbulence(point(p[1], p[2], p[0]), detail, hard));
 
 	return fac;
 }
@@ -55,7 +55,6 @@ shader node_noise_texture(
 	if (use_mapping)
 		p = transform(mapping, p);
 
-	string Basis = "Perlin";
-	Fac = noise(p * Scale, Basis, Distortion, Detail, Fac, Color);
+	Fac = noise(p * Scale, Distortion, Detail, Fac, Color);
 }
 
diff --git a/intern/cycles/kernel/shaders/node_normal.osl b/intern/cycles/kernel/shaders/node_normal.osl
index 14af044e0c0..2d04978fc72 100644
--- a/intern/cycles/kernel/shaders/node_normal.osl
+++ b/intern/cycles/kernel/shaders/node_normal.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -23,6 +23,6 @@ shader node_normal(
 	output float Dot = 1.0)
 {
 	NormalOut = normalize(Direction);
-	Dot = dot(NormalOut, NormalIn);
+	Dot = dot(NormalOut, normalize(NormalIn));
 }
 
diff --git a/intern/cycles/kernel/shaders/node_normal_map.osl b/intern/cycles/kernel/shaders/node_normal_map.osl
index c2080ecb194..01be566fb20 100644
--- a/intern/cycles/kernel/shaders/node_normal_map.osl
+++ b/intern/cycles/kernel/shaders/node_normal_map.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_object_info.osl b/intern/cycles/kernel/shaders/node_object_info.osl
index 1ebe767e82d..dd7c663b8d8 100644
--- a/intern/cycles/kernel/shaders/node_object_info.osl
+++ b/intern/cycles/kernel/shaders/node_object_info.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_output_displacement.osl b/intern/cycles/kernel/shaders/node_output_displacement.osl
index 613d6be5f3b..d0688cfda8d 100644
--- a/intern/cycles/kernel/shaders/node_output_displacement.osl
+++ b/intern/cycles/kernel/shaders/node_output_displacement.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_output_surface.osl b/intern/cycles/kernel/shaders/node_output_surface.osl
index fb16e85ce0d..2cc4575a8c8 100644
--- a/intern/cycles/kernel/shaders/node_output_surface.osl
+++ b/intern/cycles/kernel/shaders/node_output_surface.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_output_volume.osl b/intern/cycles/kernel/shaders/node_output_volume.osl
index 11a884b7d75..f220ba866e3 100644
--- a/intern/cycles/kernel/shaders/node_output_volume.osl
+++ b/intern/cycles/kernel/shaders/node_output_volume.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_particle_info.osl b/intern/cycles/kernel/shaders/node_particle_info.osl
index 077b0c114da..768b7753d02 100644
--- a/intern/cycles/kernel/shaders/node_particle_info.osl
+++ b/intern/cycles/kernel/shaders/node_particle_info.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_refraction_bsdf.osl b/intern/cycles/kernel/shaders/node_refraction_bsdf.osl
index 4a32415b482..d458ca730a4 100644
--- a/intern/cycles/kernel/shaders/node_refraction_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_refraction_bsdf.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_rgb_curves.osl b/intern/cycles/kernel/shaders/node_rgb_curves.osl
index 4e0f8721144..60cb273ba98 100644
--- a/intern/cycles/kernel/shaders/node_rgb_curves.osl
+++ b/intern/cycles/kernel/shaders/node_rgb_curves.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_rgb_ramp.osl b/intern/cycles/kernel/shaders/node_rgb_ramp.osl
index d3c2e9573d2..0202ba0bf79 100644
--- a/intern/cycles/kernel/shaders/node_rgb_ramp.osl
+++ b/intern/cycles/kernel/shaders/node_rgb_ramp.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_scatter_volume.osl b/intern/cycles/kernel/shaders/node_scatter_volume.osl
index 77c157bd92b..002e2750fca 100644
--- a/intern/cycles/kernel/shaders/node_scatter_volume.osl
+++ b/intern/cycles/kernel/shaders/node_scatter_volume.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_separate_hsv.osl b/intern/cycles/kernel/shaders/node_separate_hsv.osl
index 94fc5de9122..2a804040294 100644
--- a/intern/cycles/kernel/shaders/node_separate_hsv.osl
+++ b/intern/cycles/kernel/shaders/node_separate_hsv.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_separate_rgb.osl b/intern/cycles/kernel/shaders/node_separate_rgb.osl
index aebb63a0ee4..43d9e3aa4b1 100644
--- a/intern/cycles/kernel/shaders/node_separate_rgb.osl
+++ b/intern/cycles/kernel/shaders/node_separate_rgb.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_separate_xyz.osl b/intern/cycles/kernel/shaders/node_separate_xyz.osl
index 63725cb9995..e1963a1902f 100644
--- a/intern/cycles/kernel/shaders/node_separate_xyz.osl
+++ b/intern/cycles/kernel/shaders/node_separate_xyz.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_set_normal.osl b/intern/cycles/kernel/shaders/node_set_normal.osl
index 8eef152308a..7ca7ac9350c 100644
--- a/intern/cycles/kernel/shaders/node_set_normal.osl
+++ b/intern/cycles/kernel/shaders/node_set_normal.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_sky_texture.osl b/intern/cycles/kernel/shaders/node_sky_texture.osl
index 85c2dbdb2c2..05eed23bea8 100644
--- a/intern/cycles/kernel/shaders/node_sky_texture.osl
+++ b/intern/cycles/kernel/shaders/node_sky_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
index 1c0cd74c0be..dbbf657776c 100644
--- a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
+++ b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_tangent.osl b/intern/cycles/kernel/shaders/node_tangent.osl
index 41a2b2b0216..53a47396f9f 100644
--- a/intern/cycles/kernel/shaders/node_tangent.osl
+++ b/intern/cycles/kernel/shaders/node_tangent.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_texture.h b/intern/cycles/kernel/shaders/node_texture.h
index de51559f297..fc2cfdcd55c 100644
--- a/intern/cycles/kernel/shaders/node_texture.h
+++ b/intern/cycles/kernel/shaders/node_texture.h
@@ -11,35 +11,9 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
-/* Voronoi Distances */
-
-float voronoi_distance(string distance_metric, vector d, float e)
-{
-#if 0
-	if (distance_metric == "Distance Squared")
-#endif
-		return dot(d, d);
-#if 0
-	if (distance_metric == "Actual Distance")
-		return length(d);
-	if (distance_metric == "Manhattan")
-		return fabs(d[0]) + fabs(d[1]) + fabs(d[2]);
-	if (distance_metric == "Chebychev")
-		return max(fabs(d[0]), max(fabs(d[1]), fabs(d[2])));
-	if (distance_metric == "Minkovsky 1/2")
-		return sqrt(fabs(d[0])) + sqrt(fabs(d[1])) + sqrt(fabs(d[1]));
-	if (distance_metric == "Minkovsky 4")
-		return sqrt(sqrt(dot(d * d, d * d)));
-	if (distance_metric == "Minkovsky")
-		return pow(pow(fabs(d[0]), e) + pow(fabs(d[1]), e) + pow(fabs(d[2]), e), 1.0 / e);
-	
-	return 0.0;
-#endif
-}
-
 /* Voronoi / Worley like */
 
 color cellnoise_color(point p)
@@ -51,7 +25,7 @@ color cellnoise_color(point p)
 	return color(r, g, b);
 }
 
-void voronoi(point p, string distance_metric, float e, float da[4], point pa[4])
+void voronoi(point p, float e, float da[4], point pa[4])
 {
 	/* returns distances in da and point coords in pa */
 	int xx, yy, zz, xi, yi, zi;
@@ -71,7 +45,7 @@ void voronoi(point p, string distance_metric, float e, float da[4], point pa[4])
 				point ip = point(xx, yy, zz);
 				point vp = (point)cellnoise_color(ip);
 				point pd = p - (vp + ip);
-				float d = voronoi_distance(distance_metric, pd, e);
+				float d = dot(pd, pd);
 
 				vp += point(xx, yy, zz);
 
@@ -111,54 +85,14 @@ void voronoi(point p, string distance_metric, float e, float da[4], point pa[4])
 	}
 }
 
-float voronoi_Fn(point p, int n)
-{
-	float da[4];
-	point pa[4];
-
-	voronoi(p, "Distance Squared", 0, da, pa);
-
-	return da[n];
-}
-
-float voronoi_FnFn(point p, int n1, int n2)
-{
-	float da[4];
-	point pa[4];
-
-	voronoi(p, "Distance Squared", 0, da, pa);
-
-	return da[n2] - da[n1];
-}
-
-float voronoi_F1(point p) { return voronoi_Fn(p, 0); }
-float voronoi_F2(point p) { return voronoi_Fn(p, 1); }
-float voronoi_F3(point p) { return voronoi_Fn(p, 2); }
-float voronoi_F4(point p) { return voronoi_Fn(p, 3); }
-float voronoi_F1F2(point p) { return voronoi_FnFn(p, 0, 1); }
-
-float voronoi_Cr(point p)
-{
-	/* crackle type pattern, just a scale/clamp of F2-F1 */
-	float t = 10.0 * voronoi_F1F2(p);
-	return (t > 1.0) ? 1.0 : t;
-}
-
-float voronoi_F1S(point p) { return 2.0 * voronoi_F1(p) - 1.0; }
-float voronoi_F2S(point p) { return 2.0 * voronoi_F2(p) - 1.0; }
-float voronoi_F3S(point p) { return 2.0 * voronoi_F3(p) - 1.0; }
-float voronoi_F4S(point p) { return 2.0 * voronoi_F4(p) - 1.0; }
-float voronoi_F1F2S(point p) { return 2.0 * voronoi_F1F2(p) - 1.0; }
-float voronoi_CrS(point p) { return 2.0 * voronoi_Cr(p) - 1.0; }
-
 /* Noise Bases */
 
-float safe_noise(point p, int type)
+float safe_noise(point p, string type)
 {
 	float f = 0.0;
 	
 	/* Perlin noise in range -1..1 */
-	if (type == 0)
+	if (type == "signed")
 		f = noise("perlin", p);
 	
 	/* Perlin noise in range 0..1 */
@@ -172,39 +106,9 @@ float safe_noise(point p, int type)
 	return f;
 }
 
-float noise_basis(point p, string basis)
-{
-	if (basis == "Perlin")
-		return safe_noise(p, 1);
-	if (basis == "Voronoi F1")
-		return voronoi_F1S(p);
-	if (basis == "Voronoi F2")
-		return voronoi_F2S(p);
-	if (basis == "Voronoi F3")
-		return voronoi_F3S(p);
-	if (basis == "Voronoi F4")
-		return voronoi_F4S(p);
-	if (basis == "Voronoi F2-F1")
-		return voronoi_F1F2S(p);
-	if (basis == "Voronoi Crackle")
-		return voronoi_CrS(p);
-	if (basis == "Cell Noise")
-		return cellnoise(p);
-	
-	return 0.0;
-}
-
-/* Soft/Hard Noise */
-
-float noise_basis_hard(point p, string basis, int hard)
-{
-	float t = noise_basis(p, basis);
-	return (hard) ? fabs(2.0 * t - 1.0) : t;
-}
-
 /* Turbulence */
 
-float noise_turbulence(point p, string basis, float details, int hard)
+float noise_turbulence(point p, float details, int hard)
 {
 	float fscale = 1.0;
 	float amp = 1.0;
@@ -215,7 +119,7 @@ float noise_turbulence(point p, string basis, float details, int hard)
 	n = (int)octaves;
 
 	for (i = 0; i <= n; i++) {
-		float t = noise_basis(fscale * p, basis);
+		float t = safe_noise(fscale * p, "unsigned");
 
 		if (hard)
 			t = fabs(2.0 * t - 1.0);
@@ -228,7 +132,7 @@ float noise_turbulence(point p, string basis, float details, int hard)
 	float rmd = octaves - floor(octaves);
 
 	if (rmd != 0.0) {
-		float t = noise_basis(fscale * p, basis);
+		float t = safe_noise(fscale * p, "unsigned");
 
 		if (hard)
 			t = fabs(2.0 * t - 1.0);
diff --git a/intern/cycles/kernel/shaders/node_texture_coordinate.osl b/intern/cycles/kernel/shaders/node_texture_coordinate.osl
index 8fdf469df21..9e2109fa082 100644
--- a/intern/cycles/kernel/shaders/node_texture_coordinate.osl
+++ b/intern/cycles/kernel/shaders/node_texture_coordinate.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -21,7 +21,9 @@ shader node_texture_coordinate(
 	int is_background = 0,
 	int is_volume = 0,
 	int from_dupli = 0,
+	int use_transform = 0,
 	string bump_offset = "center",
+	matrix object_itfm = matrix(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
 
 	output point Generated = point(0.0, 0.0, 0.0),
 	output point UV = point(0.0, 0.0, 0.0),
@@ -60,7 +62,12 @@ shader node_texture_coordinate(
 			getattribute("geom:uv", UV);
 		}
 
-		Object = transform("object", P);
+		if (use_transform) {
+			Object = transform(object_itfm, P);
+		}
+		else {
+			Object = transform("object", P);
+		}
 		Camera = transform("camera", P);
 		Window = transform("NDC", P);
 		Normal = transform("world", "object", NormalIn);
diff --git a/intern/cycles/kernel/shaders/node_toon_bsdf.osl b/intern/cycles/kernel/shaders/node_toon_bsdf.osl
index 1f7e1b8e6e1..75c5d06f847 100644
--- a/intern/cycles/kernel/shaders/node_toon_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_toon_bsdf.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_translucent_bsdf.osl b/intern/cycles/kernel/shaders/node_translucent_bsdf.osl
index 8059f5788ec..94d23d35326 100644
--- a/intern/cycles/kernel/shaders/node_translucent_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_translucent_bsdf.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_transparent_bsdf.osl b/intern/cycles/kernel/shaders/node_transparent_bsdf.osl
index 552e4106b0c..5d6798f19a6 100644
--- a/intern/cycles/kernel/shaders/node_transparent_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_transparent_bsdf.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_uv_map.osl b/intern/cycles/kernel/shaders/node_uv_map.osl
index 01c984aff4c..77e2e8d12d7 100644
--- a/intern/cycles/kernel/shaders/node_uv_map.osl
+++ b/intern/cycles/kernel/shaders/node_uv_map.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_value.osl b/intern/cycles/kernel/shaders/node_value.osl
index aebfab35d2a..f75388d1f76 100644
--- a/intern/cycles/kernel/shaders/node_value.osl
+++ b/intern/cycles/kernel/shaders/node_value.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_vector_curves.osl b/intern/cycles/kernel/shaders/node_vector_curves.osl
index 137ebe112eb..7bbf97d95ea 100644
--- a/intern/cycles/kernel/shaders/node_vector_curves.osl
+++ b/intern/cycles/kernel/shaders/node_vector_curves.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_vector_math.osl b/intern/cycles/kernel/shaders/node_vector_math.osl
index 0c8857deae2..f83412dc0f7 100644
--- a/intern/cycles/kernel/shaders/node_vector_math.osl
+++ b/intern/cycles/kernel/shaders/node_vector_math.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_vector_transform.osl b/intern/cycles/kernel/shaders/node_vector_transform.osl
index 6fb0ab1d8cc..8ebaa31ab25 100644
--- a/intern/cycles/kernel/shaders/node_vector_transform.osl
+++ b/intern/cycles/kernel/shaders/node_vector_transform.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_velvet_bsdf.osl b/intern/cycles/kernel/shaders/node_velvet_bsdf.osl
index 37b26babc64..456c26998c8 100644
--- a/intern/cycles/kernel/shaders/node_velvet_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_velvet_bsdf.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_voronoi_texture.osl b/intern/cycles/kernel/shaders/node_voronoi_texture.osl
index 7a1e0016690..29e143ae207 100644
--- a/intern/cycles/kernel/shaders/node_voronoi_texture.osl
+++ b/intern/cycles/kernel/shaders/node_voronoi_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -37,7 +37,7 @@ shader node_voronoi_texture(
 	float da[4];
 	point pa[4];
 
-	voronoi(p * Scale, "Distance Squared", 1.0, da, pa);
+	voronoi(p * Scale, 1.0, da, pa);
 
 	/* Colored output */
 	if (Coloring == "Intensity") {
diff --git a/intern/cycles/kernel/shaders/node_voxel_texture.osl b/intern/cycles/kernel/shaders/node_voxel_texture.osl
new file mode 100644
index 00000000000..e45af62220f
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_voxel_texture.osl
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdosl.h"
+
+shader node_voxel_texture(
+	string filename = "",
+	string interpolation = "linear",
+	int use_mapping = 0,
+	matrix mapping = matrix(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),  /* all-zero default; only read when use_mapping is set */
+	point Vector = P,
+	output float Density = 0,
+	output color Color = 0)
+{
+	point p = Vector;
+	if (use_mapping) {  /* explicit mapping matrix supplied by the caller */
+		p = transform(mapping, p);
+	}
+	else {  /* no explicit mapping: go through object space instead */
+		p = transform("object", Vector);
+		matrix tfm;
+		if (getattribute("geom:generated_transform", tfm))  /* extra transform applied only when the attribute lookup succeeds */
+			p = transform(tfm, p);
+	}
+	if(p[0] < 0.0 || p[1] < 0.0 || p[2] < 0.0 ||
+	   p[0] > 1.0 || p[1] > 1.0 || p[2] > 1.0)
+	{  /* lookup point lies outside the [0, 1] cube: output zero density and black */
+		Density = 0;
+		Color = color(0, 0, 0);
+	}
+	else {  /* inside the cube: 3D texture lookup; Density is passed as the "alpha" argument */
+		Color = (color)texture3d(filename, p, "wrap", "periodic", "interp", interpolation, "alpha", Density);
+	}
+}
diff --git a/intern/cycles/kernel/shaders/node_wave_texture.osl b/intern/cycles/kernel/shaders/node_wave_texture.osl
index ba40207b446..569f284cbac 100644
--- a/intern/cycles/kernel/shaders/node_wave_texture.osl
+++ b/intern/cycles/kernel/shaders/node_wave_texture.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
@@ -31,7 +31,7 @@ float wave(point p, string type, float detail, float distortion, float dscale)
 	}
 
 	if (distortion != 0.0) {
-		n = n + (distortion * noise_turbulence(p * dscale, "Perlin", detail, 0));
+		n = n + (distortion * noise_turbulence(p * dscale, detail, 0));
 	}
 	return 0.5 + 0.5 * sin(n);
 }
diff --git a/intern/cycles/kernel/shaders/node_wavelength.osl b/intern/cycles/kernel/shaders/node_wavelength.osl
index 4333c1fd944..79e7043d4bf 100644
--- a/intern/cycles/kernel/shaders/node_wavelength.osl
+++ b/intern/cycles/kernel/shaders/node_wavelength.osl
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
diff --git a/intern/cycles/kernel/shaders/node_wireframe.osl b/intern/cycles/kernel/shaders/node_wireframe.osl
index db8925c9efc..5cc214495dd 100644
--- a/intern/cycles/kernel/shaders/node_wireframe.osl
+++ b/intern/cycles/kernel/shaders/node_wireframe.osl
@@ -11,17 +11,31 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #include "stdosl.h"
 #include "oslutil.h"
 
 shader node_wireframe(
+	string bump_offset = "center",  /* "dx" / "dy" add a finite-difference term to Fac; any other value leaves Fac untouched */
 	int use_pixel_size = 0,
 	float Size = 0.01,
 	output float Fac = 0.0)
 {
 	Fac = wireframe("triangles", Size, use_pixel_size);
+	/* TODO(sergey): Since we can't use autodiff here we do algebraic
+	 * calculation of derivatives by definition. We could probably
+	 * optimize this a bit by doing some extra calculation in wireframe().
+	 */
+	if (bump_offset == "dx") {
+		point dx = Dx(P);  /* NOTE(review): length(dx) is the divisor below and is not checked for zero -- confirm it cannot be zero */
+		P -= dx;  /* shift P before the second wireframe() call; presumably wireframe() reads the shifted P -- confirm */
+		Fac += (Fac - wireframe("triangles", Size, use_pixel_size)) / length(dx);
+	}
+	else if (bump_offset == "dy") {
+		point dy = Dy(P);  /* NOTE(review): same unchecked divisor as in the "dx" branch */
+		P -= dy;  /* same shift as the "dx" branch, along the y derivative */
+		Fac += (Fac - wireframe("triangles", Size, use_pixel_size)) / length(dy);
+	}
 }
-
diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h
index f8e5fd510ee..697a1756119 100644
--- a/intern/cycles/kernel/shaders/stdosl.h
+++ b/intern/cycles/kernel/shaders/stdosl.h
@@ -249,7 +249,21 @@ point rotate (point p, float angle, point a, point b)
 {
     vector axis = normalize (b - a);
     float cosang, sinang;
+    /* Older OSX has major issues with sincos() function,
+     * it's likely a bug in OSL or LLVM. Until we've
+     * updated to new versions of these libraries we'll
+     * use a workaround to prevent possible crashes on all
+     * the platforms.
+     *
+     * Shouldn't be that bad because it's mainly used for
+     * anisotropic shader where angle is usually constant.
+     */
+#if 0
     sincos (angle, sinang, cosang);
+#else
+    sinang = sin (angle);
+    cosang = cos (angle);
+#endif
     float cosang1 = 1.0 - cosang;
     float x = axis[0], y = axis[1], z = axis[2];
     matrix M = matrix (x * x + (1.0 - x * x) * cosang,
@@ -476,8 +490,6 @@ closure color diffuse_ramp(normal N, color colors[8]) BUILTIN;
 closure color phong_ramp(normal N, float exponent, color colors[8]) BUILTIN;
 closure color diffuse_toon(normal N, float size, float smooth) BUILTIN;
 closure color glossy_toon(normal N, float size, float smooth) BUILTIN;
-closure color westin_backscatter(normal N, float roughness) BUILTIN;
-closure color westin_sheen(normal N, float edginess) BUILTIN;
 closure color translucent(normal N) BUILTIN;
 closure color reflection(normal N) BUILTIN;
 closure color refraction(normal N, float eta) BUILTIN;
@@ -507,6 +519,47 @@ closure color hair_transmission(normal N, float roughnessu, float roughnessv, ve
 closure color henyey_greenstein(float g) BUILTIN;
 closure color absorption() BUILTIN;
 
+// OSL 1.5 Microfacet functions
+closure color microfacet(string distribution, normal N, vector U, float xalpha, float yalpha, float eta, int refract) {
+	/* GGX (selected by "ggx" or "default") */
+	if (distribution == "ggx" || distribution == "default") {
+		if (!refract) {  /* reflection: eta is not used in this branch */
+			if (xalpha == yalpha) {
+				/* Isotropic */
+				return microfacet_ggx(N, xalpha);
+			}
+			else {
+				/* Anisotropic */
+				return microfacet_ggx_aniso(N, U, xalpha, yalpha);
+			}
+		}
+		else {  /* refraction: only xalpha is used; U and yalpha are ignored */
+			return microfacet_ggx_refraction(N, xalpha, eta);
+		}
+	}
+	/* Beckmann (every other distribution name ends up here) */
+	else {
+		if (!refract) {  /* reflection: eta is not used in this branch */
+			if (xalpha == yalpha) {
+				/* Isotropic */
+				return microfacet_beckmann(N, xalpha);
+			}
+			else {
+				/* Anisotropic */
+				return microfacet_beckmann_aniso(N, U, xalpha, yalpha);
+			}
+		}
+		else {  /* refraction: only xalpha is used; U and yalpha are ignored */
+			return microfacet_beckmann_refraction(N, xalpha, eta);
+		}
+	}
+}
+
+closure color microfacet (string distribution, normal N, float alpha, float eta, int refract) {  /* single-roughness overload */
+	return microfacet(distribution, N, vector(0), alpha, alpha, eta, refract);  /* xalpha == yalpha, so the zero U vector is never used */
+}
+
+
 // Renderer state
 int backfacing () BUILTIN;
 int raytype (string typename) BUILTIN;
diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h
new file mode 100644
index 00000000000..0132ef9c2f2
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_background_buffer_update.h
@@ -0,0 +1,254 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_background_buffer_update kernel.
+ * This is the fourth kernel in the ray tracing logic, and the third
+ * of the path iteration kernels. This kernel takes care of rays that hit
+ * the background (sceneintersect kernel), and for the rays of
+ * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in
+ * the output buffer. This kernel also takes care of rays that have been determined
+ * to-be-regenerated.
+ *
+ * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel
+ *
+ * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
+ * will be eventually set to RAY_TO_REGENERATE state in this kernel. Finally all rays of ray_state
+ * RAY_TO_REGENERATE will be regenerated and put in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_background_buffer_update --|--- PathRadiance_coop
+ * throughput_coop --------------------------------------|                                      |--- L_transparent_coop
+ * per_sample_output_buffers ----------------------------|                                      |--- per_sample_output_buffers
+ * Ray_coop ---------------------------------------------|                                      |--- ray_state
+ * PathState_coop ---------------------------------------|                                      |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * L_transparent_coop -----------------------------------|                                      |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * ray_state --------------------------------------------|                                      |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----|                                      |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                                      |--- work_array
+ * parallel_samples -------------------------------------|                                      |--- PathState_coop
+ * end_sample -------------------------------------------|                                      |--- throughput_coop
+ * kg (globals + data) ----------------------------------|                                      |--- rng_coop
+ * rng_state --------------------------------------------|                                      |--- Ray
+ * PathRadiance_coop ------------------------------------|                                      |
+ * sw ---------------------------------------------------|                                      |
+ * sh ---------------------------------------------------|                                      |
+ * sx ---------------------------------------------------|                                      |
+ * sy ---------------------------------------------------|                                      |
+ * stride -----------------------------------------------|                                      |
+ * work_array -------------------------------------------|                                      |--- work_array
+ * queuesize --------------------------------------------|                                      |
+ * start_sample -----------------------------------------|                                      |--- work_pool_wgs
+ * work_pool_wgs ----------------------------------------|                                      |
+ * num_samples ------------------------------------------|                                      |
+ *
+ * note on shader_data : shader_data argument is neither an input nor an output for this kernel. It is just filled and consumed here itself.
+ * Note on Queues :
+ * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+ *
+ * State of queues when this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty
+ */
+ccl_device char kernel_background_buffer_update(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_state,
+        ccl_global uint *rng_coop,             /* Required for buffer Update */
+        ccl_global float3 *throughput_coop,    /* Required for background hit processing */
+        PathRadiance *PathRadiance_coop,       /* Required for background hit processing and buffer Update */
+        ccl_global Ray *Ray_coop,              /* Required for background hit processing */
+        ccl_global PathState *PathState_coop,  /* Required for background hit processing */
+        ccl_global float *L_transparent_coop,  /* Required for background hit processing and buffer Update */
+        ccl_global char *ray_state,            /* Stores information on the current state of a ray */
+        int sw, int sh, int sx, int sy, int stride,
+        int rng_state_offset_x,
+        int rng_state_offset_y,
+        int rng_state_stride,
+        ccl_global unsigned int *work_array,   /* Denotes work of each ray */
+        int end_sample,
+        int start_sample,
+#ifdef __WORK_STEALING__
+        ccl_global unsigned int *work_pool_wgs,
+        unsigned int num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples,                  /* Number of samples to be processed in parallel */
+        int ray_index)
+{
+	char enqueue_flag = 0;  /* Becomes 1 only when the ray is re-initialized as RAY_REGENERATED; returned to the caller. */
+
+	/* Load kernel globals structure and ShaderData structure */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+	ShaderData *sd = (ShaderData *)shader_data;
+
+#ifdef __KERNEL_DEBUG__
+	DebugData *debug_data = &debugdata_coop[ray_index];
+#endif
+	ccl_global PathState *state = &PathState_coop[ray_index];
+	PathRadiance *L = &PathRadiance_coop[ray_index];
+	ccl_global Ray *ray = &Ray_coop[ray_index];
+	ccl_global float3 *throughput = &throughput_coop[ray_index];
+	ccl_global float *L_transparent = &L_transparent_coop[ray_index];
+	ccl_global uint *rng = &rng_coop[ray_index];
+
+#ifdef __WORK_STEALING__
+	unsigned int my_work;
+	ccl_global float *initial_per_sample_output_buffers;
+	ccl_global uint *initial_rng;
+#endif
+	unsigned int sample;
+	unsigned int tile_x;
+	unsigned int tile_y;
+	unsigned int pixel_x;
+	unsigned int pixel_y;
+	unsigned int my_sample_tile;
+
+#ifdef __WORK_STEALING__
+	my_work = work_array[ray_index];
+	sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+	get_pixel_tile_position(&pixel_x, &pixel_y,
+	                        &tile_x, &tile_y,
+	                        my_work,
+	                        sw, sh, sx, sy,
+	                        parallel_samples,
+	                        ray_index);
+	my_sample_tile = 0;
+	initial_per_sample_output_buffers = per_sample_output_buffers;
+	initial_rng = rng_state;
+#else  /* __WORK_STEALING__ */
+	sample = work_array[ray_index];
+	int tile_index = ray_index / parallel_samples;
+	/* buffer and rng_state's stride is "stride". Find x and y using ray_index */
+	tile_x = tile_index % sw;
+	tile_y = tile_index / sw;
+	my_sample_tile = ray_index - (tile_index * parallel_samples);
+#endif  /* __WORK_STEALING__ */
+
+	rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
+	per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+
+	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+		/* eval background shader if nothing hit */
+		if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
+			*L_transparent = (*L_transparent) + average((*throughput));
+#ifdef __PASSES__
+			if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
+#endif
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+		}
+
+		if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+#ifdef __BACKGROUND__
+			/* sample background shader */
+			float3 L_background = indirect_background(kg, state, ray, sd);
+			path_radiance_accum_background(L, (*throughput), L_background, state->bounce);
+#endif
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+		}
+	}
+
+	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+		float3 L_sum = path_radiance_clamp_and_sum(kg, L);
+		kernel_write_light_passes(kg, per_sample_output_buffers, L, sample);
+#ifdef __KERNEL_DEBUG__
+		kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample);
+#endif
+		float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
+
+		/* accumulate result in output buffer */
+		kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+		path_rng_end(kg, rng_state, *rng);
+
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+	}
+
+	if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+#ifdef __WORK_STEALING__
+		/* We have completed current work; So get next work */
+		int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
+		if(!valid_work) {
+			/* If work is invalid, this means no more work is available and the thread may exit */
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+		}
+#else  /* __WORK_STEALING__ */
+		if((sample + parallel_samples) >= end_sample) {
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+		}
+#endif  /* __WORK_STEALING__ */
+
+		if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+#ifdef __WORK_STEALING__
+			work_array[ray_index] = my_work;
+			/* Get the sample associated with the current work */
+			sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+			/* Get pixel and tile position associated with current work */
+			get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+			my_sample_tile = 0;
+
+			/* Remap rng_state according to the current work */
+			rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride);
+			/* Remap per_sample_output_buffers according to the current work */
+			per_sample_output_buffers = initial_per_sample_output_buffers
+				+ (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+#else  /* __WORK_STEALING__ */
+			work_array[ray_index] = sample + parallel_samples;
+			sample = work_array[ray_index];
+
+			/* Get ray position from ray index */
+			pixel_x = sx + ((ray_index / parallel_samples) % sw);
+			pixel_y = sy + ((ray_index / parallel_samples) / sw);
+#endif  /* __WORK_STEALING__ */
+
+			/* Initialize random numbers and ray. */
+			kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray);
+
+			if(ray->t != 0.0f) {
+				/* Initialize throughput, L_transparent, Ray, PathState;
+				 * These rays proceed with path-iteration.
+				 */
+				*throughput = make_float3(1.0f, 1.0f, 1.0f);
+				*L_transparent = 0.0f;
+				path_radiance_init(L, kernel_data.film.use_light_pass);
+				path_state_init(kg, state, rng, sample, ray);
+#ifdef __KERNEL_DEBUG__
+				debug_data_init(debug_data);
+#endif
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+				enqueue_flag = 1;
+			} else {
+				/* These rays do not participate in path-iteration. */
+				float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				/* Accumulate result in output buffer. */
+				kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+				path_rng_end(kg, rng_state, *rng);
+
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+			}
+		}
+	}
+	return enqueue_flag;
+}
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
new file mode 100644
index 00000000000..4dab79a5c67
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -0,0 +1,415 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_data_initialization kernel
+ * This kernel initializes structures needed in path-iteration kernels.
+ * This is the first kernel in ray-tracing logic.
+ *
+ * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
+ *
+ * Its input and output are as follows,
+ *
+ * Un-initialized rng---------------|--- kernel_data_initialization ---|--- Initialized rng
+ * Un-initialized throughput -------|                                  |--- Initialized throughput
+ * Un-initialized L_transparent ----|                                  |--- Initialized L_transparent
+ * Un-initialized PathRadiance -----|                                  |--- Initialized PathRadiance
+ * Un-initialized Ray --------------|                                  |--- Initialized Ray
+ * Un-initialized PathState --------|                                  |--- Initialized PathState
+ * Un-initialized QueueData --------|                                  |--- Initialized QueueData (to QUEUE_EMPTY_SLOT)
+ * Un-initialized QueueIndex -------|                                  |--- Initialized QueueIndex (to 0)
+ * Un-initialized use_queues_flag---|                                  |--- Initialized use_queues_flag (to false)
+ * Un-initialized ray_state --------|                                  |--- Initialized ray_state
+ * parallel_samples --------------- |                                  |--- Initialized per_sample_output_buffers
+ * rng_state -----------------------|                                  |--- Initialized work_array
+ * data ----------------------------|                                  |--- Initialized work_pool_wgs
+ * start_sample --------------------|                                  |
+ * sx ------------------------------|                                  |
+ * sy ------------------------------|                                  |
+ * sw ------------------------------|                                  |
+ * sh ------------------------------|                                  |
+ * stride --------------------------|                                  |
+ * queuesize -----------------------|                                  |
+ * num_samples ---------------------|                                  |
+ *
+ * Note on Queues :
+ * All slots in queues are initialized to queue empty slot;
+ * The number of elements in the queues is initialized to 0;
+ */
+ccl_device void kernel_data_init(
+        ccl_global char *globals,
+        ccl_global char *shader_data_sd,                  /* Arguments related to ShaderData */
+        ccl_global char *shader_data_sd_DL_shadow,        /* Arguments related to ShaderData */
+
+        ccl_global float3 *P_sd,
+        ccl_global float3 *P_sd_DL_shadow,
+
+        ccl_global float3 *N_sd,
+        ccl_global float3 *N_sd_DL_shadow,
+
+        ccl_global float3 *Ng_sd,
+        ccl_global float3 *Ng_sd_DL_shadow,
+
+        ccl_global float3 *I_sd,
+        ccl_global float3 *I_sd_DL_shadow,
+
+        ccl_global int *shader_sd,
+        ccl_global int *shader_sd_DL_shadow,
+
+        ccl_global int *flag_sd,
+        ccl_global int *flag_sd_DL_shadow,
+
+        ccl_global int *prim_sd,
+        ccl_global int *prim_sd_DL_shadow,
+
+        ccl_global int *type_sd,
+        ccl_global int *type_sd_DL_shadow,
+
+        ccl_global float *u_sd,
+        ccl_global float *u_sd_DL_shadow,
+
+        ccl_global float *v_sd,
+        ccl_global float *v_sd_DL_shadow,
+
+        ccl_global int *object_sd,
+        ccl_global int *object_sd_DL_shadow,
+
+        ccl_global float *time_sd,
+        ccl_global float *time_sd_DL_shadow,
+
+        ccl_global float *ray_length_sd,
+        ccl_global float *ray_length_sd_DL_shadow,
+
+        ccl_global int *ray_depth_sd,
+        ccl_global int *ray_depth_sd_DL_shadow,
+
+        ccl_global int *transparent_depth_sd,
+        ccl_global int *transparent_depth_sd_DL_shadow,
+
+        /* Ray differentials. */
+        ccl_global differential3 *dP_sd,
+        ccl_global differential3 *dP_sd_DL_shadow,
+
+        ccl_global differential3 *dI_sd,
+        ccl_global differential3 *dI_sd_DL_shadow,
+
+        ccl_global differential *du_sd,
+        ccl_global differential *du_sd_DL_shadow,
+
+        ccl_global differential *dv_sd,
+        ccl_global differential *dv_sd_DL_shadow,
+
+        /* Dp/Du */
+        ccl_global float3 *dPdu_sd,
+        ccl_global float3 *dPdu_sd_DL_shadow,
+
+        ccl_global float3 *dPdv_sd,
+        ccl_global float3 *dPdv_sd_DL_shadow,
+
+        /* Object motion. */
+        ccl_global Transform *ob_tfm_sd,
+        ccl_global Transform *ob_tfm_sd_DL_shadow,
+
+        ccl_global Transform *ob_itfm_sd,
+        ccl_global Transform *ob_itfm_sd_DL_shadow,
+
+        ShaderClosure *closure_sd,
+        ShaderClosure *closure_sd_DL_shadow,
+
+        ccl_global int *num_closure_sd,
+        ccl_global int *num_closure_sd_DL_shadow,
+
+        ccl_global float *randb_closure_sd,
+        ccl_global float *randb_closure_sd_DL_shadow,
+
+        ccl_global float3 *ray_P_sd,
+        ccl_global float3 *ray_P_sd_DL_shadow,
+
+        ccl_global differential3 *ray_dP_sd,
+        ccl_global differential3 *ray_dP_sd_DL_shadow,
+
+        ccl_constant KernelData *data,
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_state,
+        ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
+        ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
+        ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
+        PathRadiance *PathRadiance_coop,             /* PathRadiance array to store PathRadiance values for all rays */
+        ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
+        ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
+        ccl_global char *ray_state,                  /* Stores information on current state of a ray */
+
+#define KERNEL_TEX(type, ttype, name)                                   \
+        ccl_global type *name,
+#include "../kernel_textures.h"
+
+        int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
+        int rng_state_offset_x,
+        int rng_state_offset_y,
+        int rng_state_stride,
+        ccl_global int *Queue_data,                  /* Memory for queues */
+        ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
+        int queuesize,                               /* size (capacity) of the queue */
+        ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
+        ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
+#ifdef __WORK_STEALING__
+        ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
+        unsigned int num_samples,                    /* Total number of samples per pixel */
+#endif
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples)                        /* Number of samples to be processed in parallel */
+{
+	/* Bind the buffers into KernelGlobals/ShaderData, reset the queues, then set up one ray per work-item. */
+	/* Load kernel globals structure */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+
+	kg->data = data;
+#define KERNEL_TEX(type, ttype, name) \
+	kg->name = name;
+#include "../kernel_textures.h"
+
+	/* Load ShaderData structure */
+	ShaderData *sd = (ShaderData *)shader_data_sd;
+	ShaderData *sd_DL_shadow = (ShaderData *)shader_data_sd_DL_shadow;
+
+	sd->P = P_sd;
+	sd_DL_shadow->P = P_sd_DL_shadow;
+
+	sd->N = N_sd;
+	sd_DL_shadow->N = N_sd_DL_shadow;
+
+	sd->Ng = Ng_sd;
+	sd_DL_shadow->Ng = Ng_sd_DL_shadow;
+
+	sd->I = I_sd;
+	sd_DL_shadow->I = I_sd_DL_shadow;
+
+	sd->shader = shader_sd;
+	sd_DL_shadow->shader = shader_sd_DL_shadow;
+
+	sd->flag = flag_sd;
+	sd_DL_shadow->flag = flag_sd_DL_shadow;
+
+	sd->prim = prim_sd;
+	sd_DL_shadow->prim = prim_sd_DL_shadow;
+
+	sd->type = type_sd;
+	sd_DL_shadow->type = type_sd_DL_shadow;
+
+	sd->u = u_sd;
+	sd_DL_shadow->u = u_sd_DL_shadow;
+
+	sd->v = v_sd;
+	sd_DL_shadow->v = v_sd_DL_shadow;
+
+	sd->object = object_sd;
+	sd_DL_shadow->object = object_sd_DL_shadow;
+
+	sd->time = time_sd;
+	sd_DL_shadow->time = time_sd_DL_shadow;
+
+	sd->ray_length = ray_length_sd;
+	sd_DL_shadow->ray_length = ray_length_sd_DL_shadow;
+
+	sd->ray_depth = ray_depth_sd;
+	sd_DL_shadow->ray_depth = ray_depth_sd_DL_shadow;
+
+	sd->transparent_depth = transparent_depth_sd;
+	sd_DL_shadow->transparent_depth = transparent_depth_sd_DL_shadow;
+
+#ifdef __RAY_DIFFERENTIALS__
+	sd->dP = dP_sd;
+	sd_DL_shadow->dP = dP_sd_DL_shadow;
+
+	sd->dI = dI_sd;
+	sd_DL_shadow->dI = dI_sd_DL_shadow;
+
+	sd->du = du_sd;
+	sd_DL_shadow->du = du_sd_DL_shadow;
+
+	sd->dv = dv_sd;
+	sd_DL_shadow->dv = dv_sd_DL_shadow;
+#ifdef __DPDU__
+	sd->dPdu = dPdu_sd;
+	sd_DL_shadow->dPdu = dPdu_sd_DL_shadow;
+
+	sd->dPdv = dPdv_sd;
+	sd_DL_shadow->dPdv = dPdv_sd_DL_shadow;
+#endif
+#endif
+
+#ifdef __OBJECT_MOTION__
+	sd->ob_tfm = ob_tfm_sd;
+	sd_DL_shadow->ob_tfm = ob_tfm_sd_DL_shadow;
+
+	sd->ob_itfm = ob_itfm_sd;
+	sd_DL_shadow->ob_itfm = ob_itfm_sd_DL_shadow;
+#endif
+
+	sd->closure = closure_sd;
+	sd_DL_shadow->closure = closure_sd_DL_shadow;
+
+	sd->num_closure = num_closure_sd;
+	sd_DL_shadow->num_closure = num_closure_sd_DL_shadow;
+
+	sd->randb_closure = randb_closure_sd;
+	sd_DL_shadow->randb_closure = randb_closure_sd_DL_shadow;
+
+	sd->ray_P = ray_P_sd;
+	sd_DL_shadow->ray_P = ray_P_sd_DL_shadow;
+
+	sd->ray_dP = ray_dP_sd;
+	sd_DL_shadow->ray_dP = ray_dP_sd_DL_shadow;
+
+	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+
+#ifdef __WORK_STEALING__
+	int lid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+	/* Initialize work_pool_wgs: the work-item with local id 0 resets its own group's counter. */
+	if(lid == 0) {
+		int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0);
+		work_pool_wgs[group_index] = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);  /* work_pool_wgs is global memory; the reset must be ordered before get_next_work(). */
+#endif  /* __WORK_STEALING__ */
+
+	/* Initialize queue data and queue index. */
+	if(thread_index < queuesize) {
+		/* Initialize active ray queue. */
+		Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		/* Initialize background and buffer update queue. */
+		Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		/* Initialize shadow ray cast of AO queue. */
+		Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		/* Initialize shadow ray cast of direct lighting queue. */
+		Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+	}
+
+	if(thread_index == 0) {
+		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+		/* The scene-intersect kernel should not use the queues the very first time,
+		 * since the queues would be empty.
+		 */
+		use_queues_flag[0] = 0;
+	}
+
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+
+	if(x < (sw * parallel_samples) && y < sh) {
+		int ray_index = x + y * (sw * parallel_samples);
+
+		/* This is the first assignment to ray_state;
+		 * so we don't use the ASSIGN_RAY_STATE macro.
+		 */
+		ray_state[ray_index] = RAY_ACTIVE;
+
+		unsigned int my_sample;
+		unsigned int pixel_x;
+		unsigned int pixel_y;
+		unsigned int tile_x;
+		unsigned int tile_y;
+		unsigned int my_sample_tile;
+
+#ifdef __WORK_STEALING__
+		unsigned int my_work = 0;
+		/* Get work. */
+		get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
+		/* Get the sample associated with the work. */
+		my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+
+		my_sample_tile = 0;
+
+		/* Get pixel and tile position associated with the work. */
+		get_pixel_tile_position(&pixel_x, &pixel_y,
+		                        &tile_x, &tile_y,
+		                        my_work,
+		                        sw, sh, sx, sy,
+		                        parallel_samples,
+		                        ray_index);
+		work_array[ray_index] = my_work;
+#else  /* __WORK_STEALING__ */
+		unsigned int tile_index = ray_index / parallel_samples;
+		tile_x = tile_index % sw;
+		tile_y = tile_index / sw;
+		my_sample_tile = ray_index - (tile_index * parallel_samples);
+		my_sample = my_sample_tile + start_sample;
+
+		/* Initialize work array. */
+		work_array[ray_index] = my_sample;
+
+		/* Calculate pixel position of this ray. */
+		pixel_x = sx + tile_x;
+		pixel_y = sy + tile_y;
+#endif  /* __WORK_STEALING__ */
+
+		rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
+
+		/* Zero this ray's pass_stride floats of per_sample_output_buffers. */
+		per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride;
+		int per_sample_output_buffers_iterator = 0;
+		for(per_sample_output_buffers_iterator = 0;
+		    per_sample_output_buffers_iterator < kernel_data.film.pass_stride;
+		    per_sample_output_buffers_iterator++)
+		{
+			per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f;
+		}
+
+		/* Initialize random numbers and ray. */
+		kernel_path_trace_setup(kg,
+		                        rng_state,
+		                        my_sample,
+		                        pixel_x, pixel_y,
+		                        &rng_coop[ray_index],
+		                        &Ray_coop[ray_index]);
+
+		if(Ray_coop[ray_index].t != 0.0f) {
+			/* Initialize throughput, L_transparent, Ray, PathState;
+			 * These rays proceed with path-iteration.
+			 */
+			throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
+			L_transparent_coop[ray_index] = 0.0f;
+			path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass);
+			path_state_init(kg,
+			                &PathState_coop[ray_index],
+			                &rng_coop[ray_index],
+			                my_sample,
+			                &Ray_coop[ray_index]);
+#ifdef __KERNEL_DEBUG__
+			debug_data_init(&debugdata_coop[ray_index]);
+#endif
+		} else {
+			/* These rays do not participate in path-iteration. */
+			float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			/* Accumulate result in output buffer. */
+			kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad);
+			path_rng_end(kg, rng_state, rng_coop[ray_index]);
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+		}
+	}
+
+	/* Mark rest of the ray-state indices as RAY_INACTIVE. */
+	if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) {
+		/* First assignment, hence we don't use the ASSIGN_RAY_STATE macro. */
+		ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE;
+	}
+}
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
new file mode 100644
index 00000000000..50c83d06140
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_direct_lighting kernel.
+ * This is the eighth kernel in the ray tracing logic. This is the seventh
+ * of the path iteration kernels. This kernel takes care of direct lighting
+ * logic. However, the "shadow ray cast" part of direct lighting is handled
+ * in the next kernel.
+ *
+ * This kernel determines the rays for which a shadow_blocked() function associated with direct lighting should be executed.
+ * Those rays for which a shadow_blocked() function for direct-lighting must be executed, are marked with flag RAY_SHADOW_RAY_CAST_DL and
+ * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop -----------------------------------------|--- kernel_direct_lighting --|--- BSDFEval_coop
+ * PathState_coop -----------------------------------|                             |--- ISLamp_coop
+ * shader_data --------------------------------------|                             |--- LightRay_coop
+ * ray_state ----------------------------------------|                             |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                             |
+ * kg (globals + data) ------------------------------|                             |
+ * queuesize ----------------------------------------|                             |
+ *
+ * Note on shader_DL : shader_DL is neither input nor output to this kernel; shader_DL is filled and consumed in this kernel itself.
+ * Note on Queues :
+ * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
+ * only the rays of state RAY_ACTIVE; If a ray needs to execute the corresponding shadow_blocked
+ * part, after direct lighting, the ray is marked with RAY_SHADOW_RAY_CAST_DL flag.
+ *
+ * State of queues when this kernel is called :
+ * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
+ * before and after this kernel call.
+ * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this
+ * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
+ */
+ccl_device char kernel_direct_lighting(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,           /* ShaderData read through ccl_fetch() (flag, time, P) */
+        ccl_global char *shader_DL,             /* ShaderData handed to direct_emission() */
+        ccl_global uint *rng_coop,              /* Per-ray RNG values */
+        ccl_global PathState *PathState_coop,   /* Per-ray path states */
+        ccl_global int *ISLamp_coop,            /* Out: per-ray is_lamp result of direct_emission() */
+        ccl_global Ray *LightRay_coop,          /* Out: per-ray ray filled by direct_emission() */
+        ccl_global BsdfEval *BSDFEval_coop,     /* Out: per-ray BsdfEval filled by direct_emission() */
+        ccl_global char *ray_state,             /* Denotes the state of each ray */
+        int ray_index)
+{
+	/* Only rays that are still RAY_ACTIVE take part in direct lighting. */
+	if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		return 0;
+	}
+
+	/* Reinterpret the raw buffers as the kernel globals and ShaderData structures. */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+	ShaderData *sd = (ShaderData *)shader_data;
+	ShaderData *sd_DL = (ShaderData *)shader_DL;
+	ccl_global PathState *state = &PathState_coop[ray_index];
+
+#ifdef __EMISSION__
+	/* Skip unless direct light is enabled and the shader data carries SD_BSDF_HAS_EVAL. */
+	if(!kernel_data.integrator.use_direct_light ||
+	   !(ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))
+	{
+		return 0;
+	}
+
+	/* Random numbers fed to light_sample() below. */
+	ccl_global RNG *rng = &rng_coop[ray_index];
+	float rand_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+	float rand_u, rand_v;
+	path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &rand_u, &rand_v);
+
+	LightSample ls;
+	light_sample(kg, rand_t, rand_u, rand_v,
+	             ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
+
+	Ray dl_ray;
+#ifdef __OBJECT_MOTION__
+	dl_ray.time = ccl_fetch(sd, time);
+#endif
+
+	BsdfEval light_eval;
+	bool from_lamp;
+	if(!direct_emission(kg, sd, &ls, &dl_ray, &light_eval, &from_lamp,
+	                    state->bounce, state->transparent_bounce, sd_DL))
+	{
+		return 0;
+	}
+
+	/* Stash the intermediate results in global memory for the next kernel. */
+	LightRay_coop[ray_index] = dl_ray;
+	BSDFEval_coop[ray_index] = light_eval;
+	ISLamp_coop[ray_index] = from_lamp;
+	/* Flag the ray so the shadow kernel picks it up; the caller enqueues on a return of 1. */
+	ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
+	return 1;
+#else
+	return 0;
+#endif  /* __EMISSION__ */
+}
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
new file mode 100644
index 00000000000..a75523a3e53
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -0,0 +1,264 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel.
+ * This is the sixth kernel in the ray tracing logic. This is the fifth
+ * of the path iteration kernels. This kernel takes care of the logic to process
+ * "material of type holdout", indirect primitive emission, bsdf blurring,
+ * probabilistic path termination and AO.
+ *
+ * This kernel determines the rays for which a shadow_blocked() function associated with AO should be executed.
+ * Those rays for which a shadow_blocked() function for AO must be executed are marked with flag RAY_SHADOW_RAY_CAST_AO and
+ * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS
+ *
+ * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * throughput_coop --------------------------------------|                                                           |--- PathState_coop
+ * PathRadiance_coop ------------------------------------|                                                           |--- throughput_coop
+ * Intersection_coop ------------------------------------|                                                           |--- L_transparent_coop
+ * PathState_coop ---------------------------------------|                                                           |--- per_sample_output_buffers
+ * L_transparent_coop -----------------------------------|                                                           |--- PathRadiance_coop
+ * shader_data ------------------------------------------|                                                           |--- ShaderData
+ * ray_state --------------------------------------------|                                                           |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------|                                                           |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                                           |--- AOAlpha_coop
+ * kg (globals + data) ----------------------------------|                                                           |--- AOBSDF_coop
+ * parallel_samples -------------------------------------|                                                           |--- AOLightRay_coop
+ * per_sample_output_buffers ----------------------------|                                                           |
+ * sw ---------------------------------------------------|                                                           |
+ * sh ---------------------------------------------------|                                                           |
+ * sx ---------------------------------------------------|                                                           |
+ * sy ---------------------------------------------------|                                                           |
+ * stride -----------------------------------------------|                                                           |
+ * work_array -------------------------------------------|                                                           |
+ * queuesize --------------------------------------------|                                                           |
+ * start_sample -----------------------------------------|                                                           |
+ *
+ * Note on Queues :
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
+ * the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER
+ * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
+ * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
+ * changed to RAY_UPDATE_BUFFER, there is no problem.
+ *
+ * State of queues when this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO
+ */
+ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Required throughout the kernel except probabilistic path termination and AO */
+        ccl_global float *per_sample_output_buffers,
+        ccl_global uint *rng_coop,             /* Required for "kernel_write_data_passes" and AO */
+        ccl_global float3 *throughput_coop,    /* Required for handling holdout material and AO */
+        ccl_global float *L_transparent_coop,  /* Required for handling holdout material */
+        PathRadiance *PathRadiance_coop,       /* Required for "kernel_write_data_passes" and indirect primitive emission */
+        ccl_global PathState *PathState_coop,  /* Required throughout the kernel and AO */
+        Intersection *Intersection_coop,       /* Required for indirect primitive emission */
+        ccl_global float3 *AOAlpha_coop,       /* Required for AO */
+        ccl_global float3 *AOBSDF_coop,        /* Required for AO */
+        ccl_global Ray *AOLightRay_coop,       /* Required for AO */
+        int sw, int sh, int sx, int sy, int stride,
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        ccl_global unsigned int *work_array,   /* Denotes the work that each ray belongs to */
+#ifdef __WORK_STEALING__
+        unsigned int start_sample,
+#endif
+        int parallel_samples,                  /* Number of samples to be processed in parallel */
+        int ray_index,                         /* Index of this ray in the per-ray (*_coop, ray_state, work_array) arrays */
+        char *enqueue_flag,                    /* Out: set to 1 when this ray is moved to RAY_UPDATE_BUFFER */
+        char *enqueue_flag_AO_SHADOW_RAY_CAST) /* Out: set to 1 when this ray is flagged RAY_SHADOW_RAY_CAST_AO */
+{
+	/* Load kernel globals structure and ShaderData structure */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+	ShaderData *sd = (ShaderData *)shader_data;
+
+#ifdef __WORK_STEALING__
+	unsigned int my_work;
+	unsigned int pixel_x;
+	unsigned int pixel_y;
+#endif
+	unsigned int tile_x;
+	unsigned int tile_y;
+	int my_sample_tile;
+	unsigned int sample;
+
+	ccl_global RNG *rng = 0x0;
+	ccl_global PathState *state = 0x0;
+	float3 throughput;
+
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		/* Load this ray's throughput, path state and RNG; they stay unset for non-active rays. */
+		throughput = throughput_coop[ray_index];
+		state = &PathState_coop[ray_index];
+		rng = &rng_coop[ray_index];
+#ifdef __WORK_STEALING__
+		my_work = work_array[ray_index];
+		sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+		get_pixel_tile_position(&pixel_x, &pixel_y,
+		                        &tile_x, &tile_y,
+		                        my_work,
+		                        sw, sh, sx, sy,
+		                        parallel_samples,
+		                        ray_index);
+		my_sample_tile = 0;
+#else  /* __WORK_STEALING__ */
+		sample = work_array[ray_index];
+		/* Buffer's stride is "stride"; Find x and y using ray_index. */
+		int tile_index = ray_index / parallel_samples;
+		tile_x = tile_index % sw;
+		tile_y = tile_index / sw;
+		my_sample_tile = ray_index - (tile_index * parallel_samples);
+#endif  /* __WORK_STEALING__ */
+		per_sample_output_buffers +=
+		    (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) *
+		    kernel_data.film.pass_stride;
+
+		/* holdout */
+#ifdef __HOLDOUT__
+		if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) &&
+		   (state->flag & PATH_RAY_CAMERA))
+		{
+			if(kernel_data.background.transparent) {
+				float3 holdout_weight;
+
+				if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK)
+					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
+				else
+					holdout_weight = shader_holdout_eval(kg, sd);
+
+				/* any throughput is ok, should all be identical here */
+				L_transparent_coop[ray_index] += average(holdout_weight*throughput);
+			}
+
+			if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) {
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+				*enqueue_flag = 1;
+			}
+		}
+#endif  /* __HOLDOUT__ */
+	}
+	/* Re-check the state: a holdout mask hit above switches the ray to RAY_UPDATE_BUFFER. */
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		PathRadiance *L = &PathRadiance_coop[ray_index];
+		/* Holdout mask objects do not write data passes. */
+		kernel_write_data_passes(kg,
+		                         per_sample_output_buffers,
+		                         L,
+		                         sd,
+		                         sample,
+		                         state,
+		                         throughput);
+		/* Blurring of bsdf after bounces, for rays that have a small likelihood
+		 * of following this particular path (diffuse, rough glossy).
+		 */
+		if(kernel_data.integrator.filter_glossy != FLT_MAX) {
+			float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
+			if(blur_pdf < 1.0f) {
+				float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
+				shader_bsdf_blur(kg, sd, blur_roughness);
+			}
+		}
+
+#ifdef __EMISSION__
+		/* emission */
+		if(ccl_fetch(sd, flag) & SD_EMISSION) {
+			/* TODO(sergey): is isect.t wrong here for transparent surfaces? */
+			float3 emission = indirect_primitive_emission(
+			        kg,
+			        sd,
+			        Intersection_coop[ray_index].t,
+			        state->flag,
+			        state->ray_pdf);
+			path_radiance_accum_emission(L, throughput, emission, state->bounce);
+		}
+#endif  /* __EMISSION__ */
+
+		/* Path termination. this is a strange place to put the termination, it's
+		 * mainly due to the mixed in MIS that we use. gives too many unneeded
+		 * shader evaluations, only need emission if we are going to terminate.
+		 */
+		float probability = path_state_terminate_probability(kg, state, throughput);
+
+		if(probability == 0.0f) {
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+			*enqueue_flag = 1;
+		}
+		/* Re-check the state: a zero probability above has already terminated the ray. */
+		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+			if(probability != 1.0f) {
+				float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
+				if(terminate >= probability) {
+					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+					*enqueue_flag = 1;
+				} else {
+					throughput_coop[ray_index] = throughput/probability;
+				}
+			}
+		}
+	}
+
+#ifdef __AO__
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		/* ambient occlusion */
+		if(kernel_data.integrator.use_ambient_occlusion ||
+		   (ccl_fetch(sd, flag) & SD_AO))
+		{
+			/* todo: solve correlation */
+			float bsdf_u, bsdf_v;
+			path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+
+			float ao_factor = kernel_data.background.ao_factor;
+			float3 ao_N;
+			AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
+			AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd);
+
+			float3 ao_D;
+			float ao_pdf;
+			sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
+
+			if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+				Ray _ray;
+				_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+				_ray.D = ao_D;
+				_ray.t = kernel_data.background.ao_distance;
+#ifdef __OBJECT_MOTION__
+				_ray.time = ccl_fetch(sd, time);
+#endif
+				_ray.dP = ccl_fetch(sd, dP);
+				_ray.dD = differential3_zero();
+				AOLightRay_coop[ray_index] = _ray;
+
+				ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+				*enqueue_flag_AO_SHADOW_RAY_CAST = 1;
+			}
+		}
+	}
+#endif  /* __AO__ */
+}
diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h
new file mode 100644
index 00000000000..a8e4b0a06c8
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_lamp_emission.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_lamp_emission
+ * This is the 3rd kernel in the ray-tracing logic. This is the second of the
+ * path-iteration kernels. This kernel takes care of the indirect lamp emission logic.
+ * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE
+ * and RAY_HIT_BACKGROUND.
+ * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel.
+ * The input/output of the kernel is as follows,
+ * Throughput_coop ------------------------------------|--- kernel_lamp_emission --|--- PathRadiance_coop
+ * Ray_coop -------------------------------------------|                           |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * PathState_coop -------------------------------------|                           |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * kg (globals + data) --------------------------------|                           |
+ * Intersection_coop ----------------------------------|                           |
+ * ray_state ------------------------------------------|                           |
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----|                           |
+ * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----|                           |
+ * queuesize ------------------------------------------|                           |
+ * use_queues_flag ------------------------------------|                           |
+ * sw -------------------------------------------------|                           |
+ * sh -------------------------------------------------|                           |
+ * parallel_samples -----------------------------------|                           |
+ *
+ * note : shader_data is neither input nor output. It is just filled and consumed within the same kernel, kernel_lamp_emission.
+ */
+ccl_device void kernel_lamp_emission(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Required for lamp emission */
+        ccl_global float3 *throughput_coop,    /* Required for lamp emission */
+        PathRadiance *PathRadiance_coop,       /* Required for lamp emission */
+        ccl_global Ray *Ray_coop,              /* Required for lamp emission */
+        ccl_global PathState *PathState_coop,  /* Required for lamp emission */
+        Intersection *Intersection_coop,       /* Required for lamp emission */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        int sw, int sh,
+        ccl_global char *use_queues_flag,      /* Used to decide if this kernel should use
+                                                * queues to fetch ray index
+                                                */
+        int parallel_samples,                  /* Number of samples to be processed in parallel */
+        int ray_index)
+{
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
+	   IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND))
+	{
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd = (ShaderData *)shader_data;
+		PathRadiance *L = &PathRadiance_coop[ray_index];
+
+		PathState state = PathState_coop[ray_index];
+		Ray ray = Ray_coop[ray_index];
+		float3 throughput = throughput_coop[ray_index];
+
+#ifdef __LAMP_MIS__
+		if(!(state.flag & PATH_RAY_CAMERA) && kernel_data.integrator.use_lamp_mis) {
+			/* ray starting from previous non-transparent bounce */
+			Ray light_ray;
+			light_ray.P = ray.P - state.ray_t*ray.D;
+			light_ray.D = ray.D;
+			light_ray.dP = ray.dP;
+			light_ray.dD = ray.dD;
+			light_ray.time = ray.time;
+			/* NOTE(review): state is a by-value copy, so this ray_t update is not
+			 * written back to PathState_coop in this function -- confirm it is intended. */
+			state.ray_t += Intersection_coop[ray_index].t;
+			light_ray.t = state.ray_t;
+			/* intersect with lamp */
+			float3 emission;
+			if(indirect_lamp_emission(kg, &state, &light_ray, &emission, sd)) {
+				path_radiance_accum_emission(L, throughput, emission, state.bounce);
+			}
+		}
+#endif  /* __LAMP_MIS__ */
+
+		/* __VOLUME__ feature is disabled */
+#if 0
+#ifdef __VOLUME__
+		/* volume attenuation, emission, scatter */
+		if(state.volume_stack[0].shader != SHADER_NONE) {
+			Ray volume_ray = ray;
+			volume_ray.t = (hit)? isect.t: FLT_MAX;
+
+			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
+
+#ifdef __VOLUME_DECOUPLED__
+			int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
+			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method);
+
+			if(decoupled) {
+				/* cache steps along volume for repeated sampling */
+				VolumeSegment volume_segment;
+				ShaderData volume_sd;
+
+				shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
+				kernel_volume_decoupled_record(kg, &state,
+					&volume_ray, &volume_sd, &volume_segment, heterogeneous);
+
+				volume_segment.sampling_method = sampling_method;
+
+				/* emission */
+				if(volume_segment.closure_flag & SD_EMISSION)
+					path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+
+				/* scattering */
+				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
+
+				if(volume_segment.closure_flag & SD_SCATTER) {
+					bool all = false;
+
+					/* direct light sampling */
+					kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
+						throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
+
+					/* indirect sample. if we use distance sampling and take just
+					 * one sample for direct and indirect light, we could share
+					 * this computation, but makes code a bit complex */
+					float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
+					float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
+
+					result = kernel_volume_decoupled_scatter(kg,
+						&state, &volume_ray, &volume_sd, &throughput,
+						rphase, rscatter, &volume_segment, NULL, true);
+				}
+
+				if(result != VOLUME_PATH_SCATTERED)
+					throughput *= volume_segment.accum_transmittance;
+
+				/* free cached steps */
+				kernel_volume_decoupled_free(kg, &volume_segment);
+
+				if(result == VOLUME_PATH_SCATTERED) {
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+						continue;
+					else
+						break;
+				}
+			}
+			else
+#endif  /* __VOLUME_DECOUPLED__ */
+			{
+				/* integrate along volume segment with distance sampling */
+				ShaderData volume_sd;
+				VolumeIntegrateResult result = kernel_volume_integrate(
+					kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng, heterogeneous);
+
+#ifdef __VOLUME_SCATTER__
+				if(result == VOLUME_PATH_SCATTERED) {
+					/* direct lighting */
+					kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L);
+
+					/* indirect light bounce */
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+						continue;
+					else
+						break;
+				}
+#endif  /* __VOLUME_SCATTER__ */
+			}
+		}
+#endif  /* __VOLUME__ */
+#endif
+	}
+}
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
new file mode 100644
index 00000000000..e1a1577d7ae
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_next_iteration_setup kernel.
+ * This is the tenth kernel in the ray tracing logic. This is the ninth
+ * of the path iteration kernels. This kernel takes care of setting up
+ * Ray for the next iteration of path-iteration and accumulating radiance
+ * corresponding to AO and direct-lighting
+ *
+ * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_next_iteration_setup -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * throughput_coop --------------------------------------|                                 |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * PathRadiance_coop ------------------------------------|                                 |--- throughput_coop
+ * PathState_coop ---------------------------------------|                                 |--- PathRadiance_coop
+ * shader_data ------------------------------------------|                                 |--- PathState_coop
+ * ray_state --------------------------------------------|                                 |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------|                                 |--- Ray_coop
+ * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                 |--- use_queues_flag
+ * Ray_coop ---------------------------------------------|                                 |
+ * kg (globals + data) ----------------------------------|                                 |
+ * LightRay_dl_coop -------------------------------------|
+ * ISLamp_coop ------------------------------------------|
+ * BSDFEval_coop ----------------------------------------|
+ * LightRay_ao_coop -------------------------------------|
+ * AOBSDF_coop ------------------------------------------|
+ * AOAlpha_coop -----------------------------------------|
+ *
+ * Note on queues,
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
+ * the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and reach the RAY_UPDATE_BUFFER
+ * state. These rays are enqueued into the QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. They will
+ * still be present in the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray state has been
+ * changed to RAY_UPDATE_BUFFER, there is no problem.
+ *
+ * State of queues when this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays
+ */
+/* Fold the stored results of the AO and direct-lighting shadow rays into the
+ * path radiance of ray_index, then set the ray up for its next bounce.
+ *
+ * Returns 1 when the path terminated here (the ray state is then
+ * RAY_UPDATE_BUFFER) and 0 otherwise.
+ */
+ccl_device char kernel_next_iteration_setup(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,         /* Required for setting up ray for next iteration */
+        ccl_global uint *rng_coop,            /* Required for setting up ray for next iteration */
+        ccl_global float3 *throughput_coop,   /* Required for setting up ray for next iteration */
+        PathRadiance *PathRadiance_coop,      /* Required for setting up ray for next iteration */
+        ccl_global Ray *Ray_coop,             /* Required for setting up ray for next iteration */
+        ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
+        ccl_global Ray *LightRay_dl_coop,     /* Required for radiance update - direct lighting */
+        ccl_global int *ISLamp_coop,          /* Required for radiance update - direct lighting */
+        ccl_global BsdfEval *BSDFEval_coop,   /* Required for radiance update - direct lighting */
+        ccl_global Ray *LightRay_ao_coop,     /* Required for radiance update - AO */
+        ccl_global float3 *AOBSDF_coop,       /* Required for radiance update - AO */
+        ccl_global float3 *AOAlpha_coop,      /* Required for radiance update - AO */
+        ccl_global char *ray_state,           /* Denotes the state of each ray */
+        ccl_global char *use_queues_flag,     /* flag to decide if scene_intersect kernel should
+                                               * use queues to fetch ray index */
+        int ray_index)
+{
+	/* Load kernel globals structure and ShaderData structure. */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+	ShaderData *sd = (ShaderData *)shader_data;
+
+	/* Path state and radiance accumulator of this ray, used by every step below. */
+	ccl_global PathState *state = &PathState_coop[ray_index];
+	PathRadiance *L = &PathRadiance_coop[ray_index];
+
+	/* Path radiance update for AO's shadow blocked: the light ray's t holds
+	 * the update flag and its P holds the shadow value. */
+	if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
+		char update_path_radiance = LightRay_ao_coop[ray_index].t;
+		if(update_path_radiance) {
+			float3 shadow = LightRay_ao_coop[ray_index].P;
+			path_radiance_accum_ao(L,
+			                       throughput_coop[ray_index],
+			                       AOAlpha_coop[ray_index],
+			                       AOBSDF_coop[ray_index],
+			                       shadow,
+			                       state->bounce);
+		}
+		REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+	}
+
+	/* Same update for direct lighting's shadow blocked. */
+	if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
+		char update_path_radiance = LightRay_dl_coop[ray_index].t;
+		if(update_path_radiance) {
+			float3 shadow = LightRay_dl_coop[ray_index].P;
+			BsdfEval L_light = BSDFEval_coop[ray_index];
+			path_radiance_accum_light(L,
+			                          throughput_coop[ray_index],
+			                          &L_light,
+			                          shadow,
+			                          1.0f,
+			                          state->bounce,
+			                          ISLamp_coop[ray_index]);
+		}
+		REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
+	}
+
+	char enqueue_flag = 0;
+
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		ccl_global float3 *throughput = &throughput_coop[ray_index];
+		ccl_global Ray *ray = &Ray_coop[ray_index];
+		ccl_global RNG* rng = &rng_coop[ray_index];
+
+		/* Compute direct lighting and next bounce. */
+		if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) {
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+			enqueue_flag = 1;
+		}
+	}
+
+	return enqueue_flag;
+}
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
new file mode 100644
index 00000000000..7eb201ecf32
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_scene_intersect kernel.
+ * This is the second kernel in the ray tracing logic. This is the first
+ * of the path iteration kernels. This kernel takes care of scene_intersect function.
+ *
+ * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE.
+ * This kernel processes rays of ray state RAY_ACTIVE
+ * This kernel determines the rays that have hit the background and changes their ray state to RAY_HIT_BACKGROUND.
+ *
+ * The input and output are as follows,
+ *
+ * Ray_coop ---------------------------------------|--------- kernel_scene_intersect----------|--- PathState
+ * PathState_coop ---------------------------------|                                          |--- Intersection
+ * ray_state --------------------------------------|                                          |--- ray_state
+ * use_queues_flag --------------------------------|                                          |
+ * parallel_samples -------------------------------|                                          |
+ * QueueData(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                                          |
+ * kg (data + globals) ----------------------------|                                          |
+ * rng_coop ---------------------------------------|                                          |
+ * sw ---------------------------------------------|                                          |
+ * sh ---------------------------------------------|                                          |
+ * queuesize --------------------------------------|                                          |
+ *
+ * Note on Queues :
+ * Ideally we would want kernel_scene_intersect to work on queues.
+ * But during the very first time, the queues will be empty and hence we perform a direct mapping
+ * between ray-index and thread-index; From the next time onward, the queue will be filled and
+ * we may start operating on queues.
+ *
+ * State of queue during the first time this kernel is called :
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty before and after this kernel.
+ *
+ * State of queues during other times this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will have a mix of RAY_ACTIVE, RAY_UPDATE_BUFFER and RAY_REGENERATED rays;
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays ;
+ * (The rays that are in the state RAY_UPDATE_BUFFER in both the queues are actually the same rays; These
+ * are the rays that were in RAY_ACTIVE state during the initial enqueue but on further processing
+ * , by different kernels, have turned into RAY_UPDATE_BUFFER rays. Since all kernel, even after fetching from
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS, proceed further based on ray state information, RAY_UPDATE_BUFFER rays
+ * being present in QUEUE_ACTIVE_AND_REGENERATED_RAYS does not cause any logical issues)
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS - All RAY_REGENERATED rays will have been converted to RAY_ACTIVE and
+ * Some rays in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue will move to state RAY_HIT_BACKGROUND
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS - no change
+ */
+
+ccl_device void kernel_scene_intersect(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global uint *rng_coop,
+        ccl_global Ray *Ray_coop,              /* Required for scene_intersect */
+        ccl_global PathState *PathState_coop,  /* Required for scene_intersect */
+        Intersection *Intersection_coop,       /* Required for scene_intersect */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        int sw, int sh,
+        ccl_global char *use_queues_flag,      /* used to decide if this kernel should use
+                                                * queues to fetch ray index */
+#ifdef __KERNEL_DEBUG__
+        DebugData *debugdata_coop,
+#endif
+        int parallel_samples,                  /* Number of samples to be processed in parallel */
+        int ray_index)
+{
+	/* Regenerated rays are promoted to active before anything else happens. */
+	if(IS_STATE(ray_state, ray_index, RAY_REGENERATED)) {
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE);
+	}
+
+	/* Rays in any other state are not traced. */
+	if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		return;
+	}
+
+	/* Load kernel globals structure */
+	KernelGlobals *kg = (KernelGlobals *)globals;
+
+	/* Ray and path state are by-value copies; isect points into Intersection_coop. */
+	Ray ray = Ray_coop[ray_index];
+	PathState state = PathState_coop[ray_index];
+	Intersection *isect = &Intersection_coop[ray_index];
+
+	/* intersect scene */
+	uint visibility = path_state_ray_visibility(kg, &state);
+
+#ifdef __HAIR__
+	/* The curve arguments stay zero unless the BVH holds curves. */
+	float difl = 0.0f;
+	float extmax = 0.0f;
+	uint lcg_state = 0;
+	RNG rng = rng_coop[ray_index];
+
+	if(kernel_data.bvh.have_curves) {
+		if((state.flag & PATH_RAY_CAMERA) && (kernel_data.cam.resolution == 1)) {
+			float3 pixdiff = ray.dD.dx + ray.dD.dy;
+			/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
+			difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
+		}
+		extmax = kernel_data.curve.maximum_width;
+		lcg_state = lcg_state_init(&rng, &state, 0x51633e2d);
+	}
+
+	bool hit = scene_intersect(kg, &ray, visibility, isect, &lcg_state, difl, extmax);
+#else
+	bool hit = scene_intersect(kg, &ray, visibility, isect, NULL, 0.0f, 0.0f);
+#endif
+
+#ifdef __KERNEL_DEBUG__
+	DebugData *debug_data = &debugdata_coop[ray_index];
+	if(state.flag & PATH_RAY_CAMERA) {
+		debug_data->num_bvh_traversal_steps += isect->num_traversal_steps;
+		debug_data->num_bvh_traversed_instances += isect->num_traversed_instances;
+	}
+	debug_data->num_ray_bounces++;
+#endif
+
+	/* A miss: the ray hit the background; it is handled by the background_bufferUpdate kernel. */
+	if(!hit) {
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
+	}
+}
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
new file mode 100644
index 00000000000..e6fdc592586
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shader_eval.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_shader_eval kernel
+ * This kernel is the 5th kernel in the ray tracing logic. This is
+ * the 4th kernel in path iteration. This kernel sets up the ShaderData
+ * structure from the values computed by the previous kernels. It also identifies
+ * the rays of state RAY_TO_REGENERATE and enqueues them in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+ *
+ * The input and output of the kernel is as follows,
+ * rng_coop -------------------------------------------|--- kernel_shader_eval --|--- shader_data
+ * Ray_coop -------------------------------------------|                         |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * PathState_coop -------------------------------------|                         |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Intersection_coop ----------------------------------|                         |
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)------|                         |
+ * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)---|                         |
+ * ray_state ------------------------------------------|                         |
+ * kg (globals + data) --------------------------------|                         |
+ * queuesize ------------------------------------------|                         |
+ *
+ * Note on Queues :
+ * This kernel reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
+ * only the rays of state RAY_ACTIVE;
+ * State of queues when this kernel is called,
+ * at entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ * at exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays
+ */
+ccl_device void kernel_shader_eval(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_data,          /* Output ShaderData structure to be filled */
+        ccl_global uint *rng_coop,             /* Required for rbsdf calculation */
+        ccl_global Ray *Ray_coop,              /* Required for setting up shader from ray */
+        ccl_global PathState *PathState_coop,  /* Required for all functions in this kernel */
+        Intersection *Intersection_coop,       /* Required for setting up shader from ray */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        int ray_index)
+{
+	/* Only active rays get their shader set up and evaluated. */
+	if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		return;
+	}
+
+	KernelGlobals *kg = (KernelGlobals *)globals;
+	ShaderData *sd = (ShaderData *)shader_data;
+	ccl_global PathState *path_state = &PathState_coop[ray_index];
+
+	/* Fill the ShaderData from a by-value copy of the ray and its stored intersection. */
+	Ray ray_copy = Ray_coop[ray_index];
+	shader_setup_from_ray(kg, sd, &Intersection_coop[ray_index], &ray_copy,
+	                      path_state->bounce, path_state->transparent_bounce);
+
+	/* Draw the PRNG_BSDF sample, then evaluate the surface shader with it. */
+	float rbsdf = path_state_rng_1D_for_decision(kg, &rng_coop[ray_index], path_state, PRNG_BSDF);
+	shader_eval_surface(kg, sd, rbsdf, path_state->flag, SHADER_CONTEXT_MAIN);
+}
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked.h b/intern/cycles/kernel/split/kernel_shadow_blocked.h
new file mode 100644
index 00000000000..28351c2b1ae
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split_common.h"
+
+/* Note on kernel_shadow_blocked kernel.
+ * This is the ninth kernel in the ray tracing logic. This is the eighth
+ * of the path iteration kernels. This kernel takes care of "shadow ray cast"
+ * logic of the direct lighting and AO  part of ray tracing.
+ *
+ * The input and output are as follows,
+ *
+ * PathState_coop ----------------------------------|--- kernel_shadow_blocked --|
+ * LightRay_dl_coop --------------------------------|                            |--- LightRay_dl_coop
+ * LightRay_ao_coop --------------------------------|                            |--- LightRay_ao_coop
+ * ray_state ---------------------------------------|                            |--- ray_state
+ * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS &       |                            |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS)
+ *            QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                            |
+ * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS &      |                            |
+ *             QUEUE_SHADOW_RAY_CAST_DL_RAYS) ------|                            |
+ * kg (globals + data) -----------------------------|                            |
+ * queuesize ---------------------------------------|                            |
+ *
+ * Note on shader_shadow : shader_shadow is neither input nor output to this kernel. shader_shadow is filled and consumed in this kernel itself.
+ * Note on queues :
+ * The kernel fetches from QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS queues. We will empty
+ * these queues in this kernel.
+ * State of queues when this kernel is called :
+ * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
+ * before and after this kernel call.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS will be filled with rays marked with flags RAY_SHADOW_RAY_CAST_AO
+ * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit.
+ */
+/* Cast the shadow ray selected by shadow_blocked_type for ray_index (RAY_SHADOW_RAY_CAST_AO
+ * picks the AO ray and intersection, any other value the direct-lighting ones). */
+ccl_device void kernel_shadow_blocked(
+        ccl_global char *globals,
+        ccl_constant KernelData *data,
+        ccl_global char *shader_shadow,        /* Required for shadow blocked */
+        ccl_global PathState *PathState_coop,  /* Required for shadow blocked */
+        ccl_global Ray *LightRay_dl_coop,      /* Required for direct lighting's shadow blocked */
+        ccl_global Ray *LightRay_ao_coop,      /* Required for AO's shadow blocked */
+        Intersection *Intersection_coop_AO,    /* Intersection storage passed to shadow_blocked for AO rays */
+        Intersection *Intersection_coop_DL,    /* Intersection storage passed to shadow_blocked for DL rays */
+        ccl_global char *ray_state,            /* Denotes the state of each ray */
+        int total_num_rays,                    /* Not referenced in this function */
+        char shadow_blocked_type,              /* RAY_SHADOW_RAY_CAST_AO or RAY_SHADOW_RAY_CAST_DL */
+        int ray_index)
+{
+	/* Nothing to do unless a shadow ray was requested for this ray. */
+	if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
+	   IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
+	{
+		/* Load kernel global structure. */
+		KernelGlobals *kg = (KernelGlobals *)globals;
+		ShaderData *sd_shadow  = (ShaderData *)shader_shadow;
+		ccl_global PathState *state = &PathState_coop[ray_index];
+
+		/* Select the light ray and its intersection slot with one test.
+		 * (Testing the bare, non-zero RAY_SHADOW_RAY_CAST_AO constant, as before,
+		 * is always true and sent direct-lighting rays to the AO slot.)
+		 */
+		ccl_global Ray *light_ray_global;
+		Intersection *isect_global;
+		if(shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO) {
+			light_ray_global = &LightRay_ao_coop[ray_index];
+			isect_global = &Intersection_coop_AO[ray_index];
+		}
+		else {
+			light_ray_global = &LightRay_dl_coop[ray_index];
+			isect_global = &Intersection_coop_DL[ray_index];
+		}
+
+		/* L needs an update only when shadow_blocked reports the ray as not blocked. */
+		float3 shadow;
+		char update_path_radiance =
+		        !shadow_blocked(kg, state, light_ray_global, &shadow, sd_shadow, isect_global);
+
+		/* We use light_ray_global's P and t to store shadow and
+		 * update_path_radiance.
+		 */
+		light_ray_global->P = shadow;
+		light_ray_global->t = update_path_radiance;
+	}
+}
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
new file mode 100644
index 00000000000..e1c7e2cea99
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef  __KERNEL_SPLIT_H__
+#define  __KERNEL_SPLIT_H__
+
+#include "kernel_compat_opencl.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+
+#include "util_atomic.h"
+
+#include "kernel_random.h"
+#include "kernel_projection.h"
+#include "kernel_montecarlo.h"
+#include "kernel_differential.h"
+#include "kernel_camera.h"
+
+#include "geom/geom.h"
+
+#include "kernel_accumulate.h"
+#include "kernel_shader.h"
+#include "kernel_light.h"
+#include "kernel_passes.h"
+
+#ifdef __SUBSURFACE__
+#include "kernel_subsurface.h"
+#endif
+
+#ifdef __VOLUME__
+#include "kernel_volume.h"
+#endif
+
+#include "kernel_path_state.h"
+#include "kernel_shadow.h"
+#include "kernel_emission.h"
+#include "kernel_path_common.h"
+#include "kernel_path_surface.h"
+#include "kernel_path_volume.h"
+
+#ifdef __KERNEL_DEBUG__
+#include "kernel_debug.h"
+#endif
+
+#include "kernel_queues.h"
+#include "kernel_work_stealing.h"
+
+#endif  /* __KERNEL_SPLIT_H__ */
diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h
new file mode 100644
index 00000000000..a21e9b6a0b1
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_sum_all_radiance.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../kernel_compat_opencl.h"
+#include "../kernel_math.h"
+#include "../kernel_types.h"
+#include "../kernel_globals.h"
+
+/* Samples are processed in parallel, so the output radiance of each sample is
+ * stored in a separate location. This kernel sums the radiance contributed by
+ * all of those samples into the RenderTile's output buffer.
+ */
+ccl_device void kernel_sum_all_radiance(
+        ccl_constant KernelData *data,               /* Provides film.pass_stride, the float count per pixel */
+        ccl_global float *buffer,                    /* Output buffer of RenderTile */
+        ccl_global float *per_sample_output_buffer,  /* Radiance contributed by all samples */
+        int parallel_samples, int sw, int sh, int stride,
+        int buffer_offset_x,
+        int buffer_offset_y,
+        int buffer_stride,
+        int start_sample)
+{
+	const int x = get_global_id(0);
+	const int y = get_global_id(1);
+
+	/* Work items outside the sw x sh region have nothing to sum. */
+	if(x >= sw || y >= sh) {
+		return;
+	}
+
+	const int pass_stride = data->film.pass_stride;
+
+	/* Destination pixel in the tile buffer and the first of its per-sample slots. */
+	ccl_global float *pixel = buffer +
+	        ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * pass_stride;
+	ccl_global float *slot = per_sample_output_buffer +
+	        ((x + y * stride) * parallel_samples) * pass_stride;
+
+	/* Only the very first sample overwrites the pixel; every other one accumulates. */
+	for(int sample = 0; sample < parallel_samples; sample++, slot += pass_stride) {
+		const bool overwrite = (start_sample == 0 && sample == 0);
+		for(int i = 0; i < pass_stride; i++) {
+			pixel[i] = overwrite ? slot[i] : pixel[i] + slot[i];
+		}
+	}
+}
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index efbffacf375..84fc0fcf587 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __SVM_H__
@@ -102,7 +102,7 @@ ccl_device_inline int stack_load_int(float *stack, uint a)
 	return __float_as_int(stack[a]);
 }
 
-ccl_device_inline float stack_load_int_default(float *stack, uint a, uint value)
+ccl_device_inline int stack_load_int_default(float *stack, uint a, uint value)
 {
 	return (a == (uint)SVM_STACK_INVALID)? (int)value: stack_load_int(stack, a);
 }
@@ -157,6 +157,8 @@ CCL_NAMESPACE_END
 #include "svm_noise.h"
 #include "svm_texture.h"
 
+#include "svm_math_util.h"
+
 #include "svm_attribute.h"
 #include "svm_gradient.h"
 #include "svm_blackbody.h"
@@ -192,20 +194,24 @@ CCL_NAMESPACE_END
 #include "svm_checker.h"
 #include "svm_brick.h"
 #include "svm_vector_transform.h"
+#include "svm_voxel.h"
 
 CCL_NAMESPACE_BEGIN
 
-/* Main Interpreter Loop */
+#define NODES_GROUP(group) ((group) <= __NODES_MAX_GROUP__)
+#define NODES_FEATURE(feature) ((__NODES_FEATURES__ & (feature)) != 0)
 
+/* Main Interpreter Loop */
 ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ShaderType type, int path_flag)
 {
 	float stack[SVM_STACK_SIZE];
-	int offset = sd->shader & SHADER_MASK;
+	int offset = ccl_fetch(sd, shader) & SHADER_MASK;
 
 	while(1) {
 		uint4 node = read_node(kg, &offset);
 
 		switch(node.x) {
+#if NODES_GROUP(NODE_GROUP_LEVEL_0)
 			case NODE_SHADER_JUMP: {
 				if(type == SHADER_TYPE_SURFACE) offset = node.y;
 				else if(type == SHADER_TYPE_VOLUME) offset = node.z;
@@ -222,15 +228,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_CLOSURE_BACKGROUND:
 				svm_node_closure_background(sd, stack, node);
 				break;
-			case NODE_CLOSURE_HOLDOUT:
-				svm_node_closure_holdout(sd, stack, node);
-				break;
-			case NODE_CLOSURE_AMBIENT_OCCLUSION:
-				svm_node_closure_ambient_occlusion(sd, stack, node);
-				break;
-			case NODE_CLOSURE_VOLUME:
-				svm_node_closure_volume(kg, sd, stack, node, path_flag);
-				break;
 			case NODE_CLOSURE_SET_WEIGHT:
 				svm_node_closure_set_weight(sd, node.y, node.z, node.w);
 				break;
@@ -251,13 +248,137 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 				if(stack_load_float(stack, node.z) == 1.0f)
 					offset += node.y;
 				break;
-#ifdef __TEXTURES__
+			case NODE_GEOMETRY:
+				svm_node_geometry(kg, sd, stack, node.y, node.z);
+				break;
+			case NODE_CONVERT:
+				svm_node_convert(sd, stack, node.y, node.z, node.w);
+				break;
+			case NODE_TEX_COORD:
+				svm_node_tex_coord(kg, sd, path_flag, stack, node, &offset);
+				break;
+			case NODE_VALUE_F:
+				svm_node_value_f(kg, sd, stack, node.y, node.z);
+				break;
+			case NODE_VALUE_V:
+				svm_node_value_v(kg, sd, stack, node.y, &offset);
+				break;
+			case NODE_ATTR:
+				svm_node_attr(kg, sd, stack, node);
+				break;
+#  if NODES_FEATURE(NODE_FEATURE_BUMP)
+			case NODE_GEOMETRY_BUMP_DX:
+				svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z);
+				break;
+			case NODE_GEOMETRY_BUMP_DY:
+				svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z);
+				break;
+			case NODE_SET_DISPLACEMENT:
+				svm_node_set_displacement(sd, stack, node.y);
+				break;
+#  endif  /* NODES_FEATURE(NODE_FEATURE_BUMP) */
+#  ifdef __TEXTURES__
 			case NODE_TEX_IMAGE:
 				svm_node_tex_image(kg, sd, stack, node);
 				break;
 			case NODE_TEX_IMAGE_BOX:
 				svm_node_tex_image_box(kg, sd, stack, node);
 				break;
+			case NODE_TEX_NOISE:
+				svm_node_tex_noise(kg, sd, stack, node, &offset);
+				break;
+#  endif  /* __TEXTURES__ */
+#  ifdef __EXTRA_NODES__
+#    if NODES_FEATURE(NODE_FEATURE_BUMP)
+			case NODE_SET_BUMP:
+				svm_node_set_bump(kg, sd, stack, node);
+				break;
+			case NODE_ATTR_BUMP_DX:
+				svm_node_attr_bump_dx(kg, sd, stack, node);
+				break;
+			case NODE_ATTR_BUMP_DY:
+				svm_node_attr_bump_dy(kg, sd, stack, node);
+				break;
+			case NODE_TEX_COORD_BUMP_DX:
+				svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, &offset);
+				break;
+			case NODE_TEX_COORD_BUMP_DY:
+				svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, &offset);
+				break;
+			case NODE_CLOSURE_SET_NORMAL:
+				svm_node_set_normal(kg, sd, stack, node.y, node.z);
+				break;
+#    endif  /* NODES_FEATURE(NODE_FEATURE_BUMP) */
+			case NODE_HSV:
+				svm_node_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
+				break;
+#  endif  /* __EXTRA_NODES__ */
+#endif  /* NODES_GROUP(NODE_GROUP_LEVEL_0) */
+
+#if NODES_GROUP(NODE_GROUP_LEVEL_1)
+			case NODE_CLOSURE_HOLDOUT:
+				svm_node_closure_holdout(sd, stack, node);
+				break;
+			case NODE_CLOSURE_AMBIENT_OCCLUSION:
+				svm_node_closure_ambient_occlusion(sd, stack, node);
+				break;
+			case NODE_FRESNEL:
+				svm_node_fresnel(sd, stack, node.y, node.z, node.w);
+				break;
+			case NODE_LAYER_WEIGHT:
+				svm_node_layer_weight(sd, stack, node);
+				break;
+#  if NODES_FEATURE(NODE_FEATURE_VOLUME)
+			case NODE_CLOSURE_VOLUME:
+				svm_node_closure_volume(kg, sd, stack, node, path_flag);
+				break;
+#  endif  /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
+#  ifdef __EXTRA_NODES__
+			case NODE_MATH:
+				svm_node_math(kg, sd, stack, node.y, node.z, node.w, &offset);
+				break;
+			case NODE_VECTOR_MATH:
+				svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, &offset);
+				break;
+			case NODE_RGB_RAMP:
+				svm_node_rgb_ramp(kg, sd, stack, node, &offset);
+				break;
+			case NODE_GAMMA:
+				svm_node_gamma(sd, stack, node.y, node.z, node.w);
+				break;
+			case NODE_BRIGHTCONTRAST:
+				svm_node_brightness(sd, stack, node.y, node.z, node.w);
+				break;
+			case NODE_LIGHT_PATH:
+				svm_node_light_path(sd, stack, node.y, node.z, path_flag);
+				break;
+			case NODE_OBJECT_INFO:
+				svm_node_object_info(kg, sd, stack, node.y, node.z);
+				break;
+			case NODE_PARTICLE_INFO:
+				svm_node_particle_info(kg, sd, stack, node.y, node.z);
+				break;
+#    ifdef __HAIR__
+#      if NODES_FEATURE(NODE_FEATURE_HAIR)
+			case NODE_HAIR_INFO:
+				svm_node_hair_info(kg, sd, stack, node.y, node.z);
+				break;
+#      endif  /* NODES_FEATURE(NODE_FEATURE_HAIR) */
+#    endif  /* __HAIR__ */
+#  endif  /* __EXTRA_NODES__ */
+#endif  /* NODES_GROUP(NODE_GROUP_LEVEL_1) */
+
+#if NODES_GROUP(NODE_GROUP_LEVEL_2)
+			case NODE_MAPPING:
+				svm_node_mapping(kg, sd, stack, node.y, node.z, &offset);
+				break;
+			case NODE_MIN_MAX:
+				svm_node_min_max(kg, sd, stack, node.y, node.z, &offset);
+				break;
+			case NODE_CAMERA:
+				svm_node_camera(kg, sd, stack, node.y, node.z, node.w);
+				break;
+#  ifdef __TEXTURES__
 			case NODE_TEX_ENVIRONMENT:
 				svm_node_tex_environment(kg, sd, stack, node);
 				break;
@@ -267,9 +388,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_TEX_GRADIENT:
 				svm_node_tex_gradient(sd, stack, node);
 				break;
-			case NODE_TEX_NOISE:
-				svm_node_tex_noise(kg, sd, stack, node, &offset);
-				break;
 			case NODE_TEX_VORONOI:
 				svm_node_tex_voronoi(kg, sd, stack, node, &offset);
 				break;
@@ -288,55 +406,34 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_TEX_BRICK:
 				svm_node_tex_brick(kg, sd, stack, node, &offset);
 				break;
-#endif
-			case NODE_CAMERA:
-				svm_node_camera(kg, sd, stack, node.y, node.z, node.w);
-				break;
-			case NODE_GEOMETRY:
-				svm_node_geometry(kg, sd, stack, node.y, node.z);
-				break;
-#ifdef __EXTRA_NODES__
-			case NODE_GEOMETRY_BUMP_DX:
-				svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z);
-				break;
-			case NODE_GEOMETRY_BUMP_DY:
-				svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z);
-				break;
-			case NODE_LIGHT_PATH:
-				svm_node_light_path(sd, stack, node.y, node.z, path_flag);
-				break;
-			case NODE_OBJECT_INFO:
-				svm_node_object_info(kg, sd, stack, node.y, node.z);
-				break;
-			case NODE_PARTICLE_INFO:
-				svm_node_particle_info(kg, sd, stack, node.y, node.z);
+#  endif  /* __TEXTURES__ */
+#  ifdef __EXTRA_NODES__
+			case NODE_NORMAL:
+				svm_node_normal(kg, sd, stack, node.y, node.z, node.w, &offset);
 				break;
-#ifdef __HAIR__
-			case NODE_HAIR_INFO:
-				svm_node_hair_info(kg, sd, stack, node.y, node.z);
+			case NODE_LIGHT_FALLOFF:
+				svm_node_light_falloff(sd, stack, node);
 				break;
-#endif
+#  endif  /* __EXTRA_NODES__ */
+#endif  /* NODES_GROUP(NODE_GROUP_LEVEL_2) */
 
-#endif
-			case NODE_CONVERT:
-				svm_node_convert(sd, stack, node.y, node.z, node.w);
+#if NODES_GROUP(NODE_GROUP_LEVEL_3)
+			case NODE_RGB_CURVES:
+				svm_node_rgb_curves(kg, sd, stack, node, &offset);
 				break;
-			case NODE_VALUE_F:
-				svm_node_value_f(kg, sd, stack, node.y, node.z);
+			case NODE_VECTOR_CURVES:
+				svm_node_vector_curves(kg, sd, stack, node, &offset);
 				break;
-			case NODE_VALUE_V:
-				svm_node_value_v(kg, sd, stack, node.y, &offset);
+			case NODE_TANGENT:
+				svm_node_tangent(kg, sd, stack, node);
 				break;
-#ifdef __EXTRA_NODES__
+			case NODE_NORMAL_MAP:
+				svm_node_normal_map(kg, sd, stack, node);
+				break;
+#  ifdef __EXTRA_NODES__
 			case NODE_INVERT:
 				svm_node_invert(sd, stack, node.y, node.z, node.w);
 				break;
-			case NODE_GAMMA:
-				svm_node_gamma(sd, stack, node.y, node.z, node.w);
-				break;
-			case NODE_BRIGHTCONTRAST:
-				svm_node_brightness(sd, stack, node.y, node.z, node.w);
-				break;
 			case NODE_MIX:
 				svm_node_mix(kg, sd, stack, node.y, node.z, node.w, &offset);
 				break;
@@ -352,30 +449,11 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_COMBINE_HSV:
 				svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
 				break;
-			case NODE_HSV:
-				svm_node_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
-				break;
-#endif
-			case NODE_ATTR:
-				svm_node_attr(kg, sd, stack, node);
-				break;
-#ifdef __EXTRA_NODES__
-			case NODE_ATTR_BUMP_DX:
-				svm_node_attr_bump_dx(kg, sd, stack, node);
-				break;
-			case NODE_ATTR_BUMP_DY:
-				svm_node_attr_bump_dy(kg, sd, stack, node);
-				break;
-#endif
-			case NODE_FRESNEL:
-				svm_node_fresnel(sd, stack, node.y, node.z, node.w);
-				break;
-			case NODE_LAYER_WEIGHT:
-				svm_node_layer_weight(sd, stack, node);
+			case NODE_VECTOR_TRANSFORM:
+				svm_node_vector_transform(kg, sd, stack, node);
 				break;
-#ifdef __EXTRA_NODES__
 			case NODE_WIREFRAME:
-				svm_node_wireframe(kg, sd, stack, node.y, node.z, node.w);
+				svm_node_wireframe(kg, sd, stack, node);
 				break;
 			case NODE_WAVELENGTH:
 				svm_node_wavelength(sd, stack, node.y, node.z);
@@ -383,70 +461,25 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_BLACKBODY:
 				svm_node_blackbody(kg, sd, stack, node.y, node.z);
 				break;
-			case NODE_SET_DISPLACEMENT:
-				svm_node_set_displacement(sd, stack, node.y);
-				break;
-			case NODE_SET_BUMP:
-				svm_node_set_bump(kg, sd, stack, node);
-				break;
-			case NODE_MATH:
-				svm_node_math(kg, sd, stack, node.y, node.z, node.w, &offset);
-				break;
-			case NODE_VECTOR_MATH:
-				svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, &offset);
-				break;
-			case NODE_VECTOR_TRANSFORM:
-				svm_node_vector_transform(kg, sd, stack, node);
-				break;
-			case NODE_NORMAL:
-				svm_node_normal(kg, sd, stack, node.y, node.z, node.w, &offset);
-				break;
-#endif
-			case NODE_MAPPING:
-				svm_node_mapping(kg, sd, stack, node.y, node.z, &offset);
-				break;
-			case NODE_MIN_MAX:
-				svm_node_min_max(kg, sd, stack, node.y, node.z, &offset);
-				break;
-			case NODE_TEX_COORD:
-				svm_node_tex_coord(kg, sd, path_flag, stack, node.y, node.z);
-				break;
-#ifdef __EXTRA_NODES__
-			case NODE_TEX_COORD_BUMP_DX:
-				svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node.y, node.z);
-				break;
-			case NODE_TEX_COORD_BUMP_DY:
-				svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node.y, node.z);
-				break;
-			case NODE_CLOSURE_SET_NORMAL:
-				svm_node_set_normal(kg, sd, stack, node.y, node.z );
-				break;
-			case NODE_RGB_RAMP:
-				svm_node_rgb_ramp(kg, sd, stack, node, &offset);
-				break;
-			case NODE_RGB_CURVES:
-				svm_node_rgb_curves(kg, sd, stack, node, &offset);
-				break;
-			case NODE_VECTOR_CURVES:
-				svm_node_vector_curves(kg, sd, stack, node, &offset);
-				break;
-			case NODE_LIGHT_FALLOFF:
-				svm_node_light_falloff(sd, stack, node);
-				break;
-#endif
-			case NODE_TANGENT:
-				svm_node_tangent(kg, sd, stack, node);
-				break;
-			case NODE_NORMAL_MAP:
-				svm_node_normal_map(kg, sd, stack, node);
+#  endif  /* __EXTRA_NODES__ */
+#  if NODES_FEATURE(NODE_FEATURE_VOLUME) && !defined(__KERNEL_GPU__)
+			case NODE_TEX_VOXEL:
+				svm_node_tex_voxel(kg, sd, stack, node, &offset);
 				break;
+#  endif  /* NODES_FEATURE(NODE_FEATURE_VOLUME) && !defined(__KERNEL_GPU__) */
+#endif  /* NODES_GROUP(NODE_GROUP_LEVEL_3) */
 			case NODE_END:
+				return;
 			default:
+				kernel_assert(!"Unknown node type was passed to the SVM machine");
 				return;
 		}
 	}
 }
 
+#undef NODES_GROUP
+#undef NODES_FEATURE
+
 CCL_NAMESPACE_END
 
 #ifdef __CAMERA_RAY_NODES__
diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h
index fd0ea7fef31..025ae96f59d 100644
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -22,12 +22,12 @@ ccl_device void svm_node_attr_init(KernelGlobals *kg, ShaderData *sd,
 	uint4 node, NodeAttributeType *type,
 	NodeAttributeType *mesh_type, AttributeElement *elem, int *offset, uint *out_offset)
 {
-	if(sd->object != OBJECT_NONE) {
+	if(ccl_fetch(sd, object) != OBJECT_NONE) {
 		/* find attribute by unique id */
 		uint id = node.y;
-		uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
+		uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride;
 #ifdef __HAIR__
-		attr_offset = (sd->type & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
+		attr_offset = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
 #endif
 		uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 		
diff --git a/intern/cycles/kernel/svm/svm_blackbody.h b/intern/cycles/kernel/svm/svm_blackbody.h
index 15257aed92e..b750ad87b7f 100644
--- a/intern/cycles/kernel/svm/svm_blackbody.h
+++ b/intern/cycles/kernel/svm/svm_blackbody.h
@@ -36,48 +36,12 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device void svm_node_blackbody(KernelGlobals *kg, ShaderData *sd, float *stack, uint temperature_offset, uint col_offset)
 {
-	/* Output */
-	float3 color_rgb = make_float3(0.0f, 0.0f, 0.0f);
-
 	/* Input */
 	float temperature = stack_load_float(stack, temperature_offset);
 
-	if (temperature < BB_DRAPPER) {
-		/* just return very very dim red */
-		color_rgb = make_float3(1.0e-6f,0.0f,0.0f);
-	}
-	else if (temperature <= BB_MAX_TABLE_RANGE) {
-		/* This is the overall size of the table */
-		const int lookuptablesize = 956;
-		const float lookuptablenormalize = 1.0f/956.0f;
-
-		/* reconstruct a proper index for the table lookup, compared to OSL we don't look up two colors
-		just one (the OSL-lerp is also automatically done for us by "lookup_table_read") */
-		float t = powf((temperature - BB_DRAPPER) * (1.0f / BB_TABLE_SPACING), (1.0f / BB_TABLE_XPOWER));
-
-		int blackbody_table_offset = kernel_data.tables.blackbody_offset;
-
-		/* Retrieve colors from the lookup table */
-		float lutval = t*lookuptablenormalize;
-		float R = lookup_table_read(kg, lutval, blackbody_table_offset, lookuptablesize);
-		lutval = (t + 319.0f*1.0f)*lookuptablenormalize;
-		float G = lookup_table_read(kg, lutval, blackbody_table_offset, lookuptablesize);
-		lutval = (t + 319.0f*2.0f)*lookuptablenormalize;
-		float B = lookup_table_read(kg, lutval, blackbody_table_offset, lookuptablesize);
-
-		R = powf(R, BB_TABLE_YPOWER);
-		G = powf(G, BB_TABLE_YPOWER);
-		B = powf(B, BB_TABLE_YPOWER);
-
-		color_rgb = make_float3(R, G, B);
-	}
-
-	/* Luminance */
-	float l = linear_rgb_to_gray(color_rgb);
-	if (l != 0.0f)
-		color_rgb /= l;
+	float3 color_rgb = svm_math_blackbody_color(temperature);
 
-	if (stack_valid(col_offset))
+	if(stack_valid(col_offset))
 		stack_store_float3(stack, col_offset, color_rgb);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_brick.h b/intern/cycles/kernel/svm/svm_brick.h
index 97c2b545c5f..9b0cf5ab8c4 100644
--- a/intern/cycles/kernel/svm/svm_brick.h
+++ b/intern/cycles/kernel/svm/svm_brick.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -21,6 +21,7 @@ CCL_NAMESPACE_BEGIN
 ccl_device_noinline float brick_noise(int n) /* fast integer noise */
 {
 	int nn;
+	n = (n + 1013) & 0x7fffffff;
 	n = (n >> 13) ^ n;
 	nn = (n * (n * n * 60493 + 19990303) + 1376312589) & 0x7fffffff;
 	return 0.5f * ((float)nn / 1073741824.0f);
@@ -47,7 +48,7 @@ ccl_device_noinline float2 svm_brick(float3 p, float mortar_size, float bias,
 	y = p.y - row_height*rownum;
 
 	return make_float2(
-		clamp((brick_noise((rownum << 16) + (bricknum & 0xFFFF)) + bias), 0.0f, 1.0f),
+		saturate((brick_noise((rownum << 16) + (bricknum & 0xFFFF)) + bias)),
 
 		(x < mortar_size || y < mortar_size ||
 		x > (brick_width - mortar_size) ||
@@ -95,10 +96,7 @@ ccl_device void svm_node_tex_brick(KernelGlobals *kg, ShaderData *sd, float *sta
 	
 	if(f != 1.0f) {
 		float facm = 1.0f - tint;
-
-		color1.x = facm * (color1.x) + tint * color2.x;
-		color1.y = facm * (color1.y) + tint * color2.y;
-		color1.z = facm * (color1.z) + tint * color2.z;
+		color1 = facm * color1 + tint * color2;
 	}
 
 	if(stack_valid(color_offset))
diff --git a/intern/cycles/kernel/svm/svm_brightness.h b/intern/cycles/kernel/svm/svm_brightness.h
index 9b330b3213f..e4d545a00ae 100644
--- a/intern/cycles/kernel/svm/svm_brightness.h
+++ b/intern/cycles/kernel/svm/svm_brightness.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -32,7 +32,7 @@ ccl_device void svm_node_brightness(ShaderData *sd, float *stack, uint in_color,
 	color.y = max(a*color.y + b, 0.0f);
 	color.z = max(a*color.z + b, 0.0f);
 
-	if (stack_valid(out_color))
+	if(stack_valid(out_color))
 		stack_store_float3(stack, out_color, color);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h
index bfe9289fa02..00678a49d70 100644
--- a/intern/cycles/kernel/svm/svm_camera.h
+++ b/intern/cycles/kernel/svm/svm_camera.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -23,17 +23,17 @@ ccl_device void svm_node_camera(KernelGlobals *kg, ShaderData *sd, float *stack,
 	float3 vector;
 
 	Transform tfm = kernel_data.cam.worldtocamera;
-	vector = transform_point(&tfm, sd->P);
+	vector = transform_point(&tfm, ccl_fetch(sd, P));
 	zdepth = vector.z;
 	distance = len(vector);
 
-	if (stack_valid(out_vector))
+	if(stack_valid(out_vector))
 		stack_store_float3(stack, out_vector, normalize(vector));
 
-	if (stack_valid(out_zdepth))
+	if(stack_valid(out_zdepth))
 		stack_store_float(stack, out_zdepth, zdepth);
 
-	if (stack_valid(out_distance))
+	if(stack_valid(out_distance))
 		stack_store_float(stack, out_distance, distance);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_checker.h b/intern/cycles/kernel/svm/svm_checker.h
index e0408ad334a..186bf7df55f 100644
--- a/intern/cycles/kernel/svm/svm_checker.h
+++ b/intern/cycles/kernel/svm/svm_checker.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 30110db3ef9..c495ebb35bd 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -25,10 +25,14 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
 			sc->data0 = eta;
 			sc->data1 = 0.0f;
 			sc->data2 = 0.0f;
-			sd->flag |= bsdf_refraction_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_refraction_setup(sc);
+		}
+		else {
+			sc->data0 = 0.0f;
+			sc->data1 = 0.0f;
+			sc->data2 = 0.0f;
+			ccl_fetch(sd, flag) |= bsdf_reflection_setup(sc);
 		}
-		else
-			sd->flag |= bsdf_reflection_setup(sc);
 	}
 	else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) {
 		sc->data0 = roughness;
@@ -36,9 +40,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
 		sc->data2 = eta;
 
 		if(refract)
-			sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(sc);
 		else
-			sd->flag |= bsdf_microfacet_beckmann_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(sc);
 	}
 	else {
 		sc->data0 = roughness;
@@ -46,23 +50,26 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
 		sc->data2 = eta;
 
 		if(refract)
-			sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(sc);
 		else
-			sd->flag |= bsdf_microfacet_ggx_setup(sc);
+			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(sc);
 	}
 }
 
 ccl_device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, ClosureType type, float mix_weight)
 {
-	ShaderClosure *sc = &sd->closure[sd->num_closure];
+	ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
 
-	if(sd->num_closure < MAX_CLOSURE) {
+	if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
 		sc->weight *= mix_weight;
 		sc->type = type;
+		sc->data0 = 0.0f;
+		sc->data1 = 0.0f;
+		sc->data2 = 0.0f;
 #ifdef __OSL__
 		sc->prim = NULL;
 #endif
-		sd->num_closure++;
+		ccl_fetch(sd, num_closure)++;
 		return sc;
 	}
 
@@ -71,14 +78,15 @@ ccl_device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, C
 
 ccl_device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float mix_weight)
 {
-	ShaderClosure *sc = &sd->closure[sd->num_closure];
+	ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
+
 	float3 weight = sc->weight * mix_weight;
 	float sample_weight = fabsf(average(weight));
 
-	if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) {
+	if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
 		sc->weight = weight;
 		sc->sample_weight = sample_weight;
-		sd->num_closure++;
+		ccl_fetch(sd, num_closure)++;
 #ifdef __OSL__
 		sc->prim = NULL;
 #endif
@@ -90,14 +98,15 @@ ccl_device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float
 
 ccl_device_inline ShaderClosure *svm_node_closure_get_absorption(ShaderData *sd, float mix_weight)
 {
-	ShaderClosure *sc = &sd->closure[sd->num_closure];
+	ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
+
 	float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - sc->weight) * mix_weight;
 	float sample_weight = fabsf(average(weight));
 
-	if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) {
+	if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
 		sc->weight = weight;
 		sc->sample_weight = sample_weight;
-		sd->num_closure++;
+		ccl_fetch(sd, num_closure)++;
 #ifdef __OSL__
 		sc->prim = NULL;
 #endif
@@ -121,7 +130,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 	if(mix_weight == 0.0f)
 		return;
 
-	float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): sd->N; 
+	float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): ccl_fetch(sd, N);
 
 	float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z);
 	float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w);
@@ -139,13 +148,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->data0 = 0.0f;
 					sc->data1 = 0.0f;
 					sc->data2 = 0.0f;
-					sd->flag |= bsdf_diffuse_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_diffuse_setup(sc);
 				}
 				else {
 					sc->data0 = roughness;
 					sc->data1 = 0.0f;
 					sc->data2 = 0.0f;
-					sd->flag |= bsdf_oren_nayar_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_oren_nayar_setup(sc);
 				}
 			}
 			break;
@@ -158,7 +167,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->data1 = 0.0f;
 				sc->data2 = 0.0f;
 				sc->N = N;
-				sd->flag |= bsdf_translucent_setup(sc);
+				ccl_fetch(sd, flag) |= bsdf_translucent_setup(sc);
 			}
 			break;
 		}
@@ -170,7 +179,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->data1 = 0.0f;
 				sc->data2 = 0.0f;
 				sc->N = N;
-				sd->flag |= bsdf_transparent_setup(sc);
+				ccl_fetch(sd, flag) |= bsdf_transparent_setup(sc);
 			}
 			break;
 		}
@@ -192,13 +201,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 				/* setup bsdf */
 				if(type == CLOSURE_BSDF_REFLECTION_ID)
-					sd->flag |= bsdf_reflection_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_reflection_setup(sc);
 				else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID)
-					sd->flag |= bsdf_microfacet_beckmann_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(sc);
 				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID)
-					sd->flag |= bsdf_microfacet_ggx_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(sc);
 				else
-					sd->flag |= bsdf_ashikhmin_shirley_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_setup(sc);
 			}
 
 			break;
@@ -216,7 +225,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->N = N;
 
 				float eta = fmaxf(param2, 1e-5f);
-				eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
+				eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
 
 				/* setup bsdf */
 				if(type == CLOSURE_BSDF_REFRACTION_ID) {
@@ -224,7 +233,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->data1 = 0.0f;
 					sc->data2 = 0.0f;
 
-					sd->flag |= bsdf_refraction_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_refraction_setup(sc);
 				}
 				else {
 					sc->data0 = param1;
@@ -232,9 +241,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->data2 = eta;
 
 					if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID)
-						sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(sc);
 					else
-						sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(sc);
 				}
 			}
 
@@ -251,15 +260,15 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 #endif
 			/* index of refraction */
 			float eta = fmaxf(param2, 1e-5f);
-			eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
+			eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
 
 			/* fresnel */
-			float cosNO = dot(N, sd->I);
+			float cosNO = dot(N, ccl_fetch(sd, I));
 			float fresnel = fresnel_dielectric_cos(cosNO, eta);
 			float roughness = param1;
 
 			/* reflection */
-			ShaderClosure *sc = &sd->closure[sd->num_closure];
+			ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
 			float3 weight = sc->weight;
 			float sample_weight = sc->sample_weight;
 
@@ -280,15 +289,17 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 #endif
 
 			/* refraction */
-			sc = &sd->closure[sd->num_closure];
-			sc->weight = weight;
-			sc->sample_weight = sample_weight;
+			if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
+				sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
+				sc->weight = weight;
+				sc->sample_weight = sample_weight;
 
-			sc = svm_node_closure_get_bsdf(sd, mix_weight*(1.0f - fresnel));
+				sc = svm_node_closure_get_bsdf(sd, mix_weight*(1.0f - fresnel));
 
-			if(sc) {
-				sc->N = N;
-				svm_node_glass_setup(sd, sc, type, eta, roughness, true);
+				if(sc) {
+					sc->N = N;
+					svm_node_glass_setup(sd, sc, type, eta, roughness, true);
+				}
 			}
 
 			break;
@@ -328,12 +339,12 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 				sc->data2 = 0.0f;
 
-				if (type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID)
-					sd->flag |= bsdf_microfacet_beckmann_aniso_setup(sc);
-				else if (type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID)
-					sd->flag |= bsdf_microfacet_ggx_aniso_setup(sc);
+				if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID)
+					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_aniso_setup(sc);
+				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID)
+					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_setup(sc);
 				else
-					sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_aniso_setup(sc);
 			}
 			break;
 		}
@@ -344,10 +355,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->N = N;
 
 				/* sigma */
-				sc->data0 = clamp(param1, 0.0f, 1.0f);
+				sc->data0 = saturate(param1);
 				sc->data1 = 0.0f;
 				sc->data2 = 0.0f;
-				sd->flag |= bsdf_ashikhmin_velvet_setup(sc);
+				ccl_fetch(sd, flag) |= bsdf_ashikhmin_velvet_setup(sc);
 			}
 			break;
 		}
@@ -362,10 +373,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->data1 = param2;
 				sc->data2 = 0.0f;
 				
-				if (type == CLOSURE_BSDF_DIFFUSE_TOON_ID)
-					sd->flag |= bsdf_diffuse_toon_setup(sc);
+				if(type == CLOSURE_BSDF_DIFFUSE_TOON_ID)
+					ccl_fetch(sd, flag) |= bsdf_diffuse_toon_setup(sc);
 				else
-					sd->flag |= bsdf_glossy_toon_setup(sc);
+					ccl_fetch(sd, flag) |= bsdf_glossy_toon_setup(sc);
 			}
 			break;
 		}
@@ -373,7 +384,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: {
 			
-			if(sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) {
+			if(ccl_fetch(sd, flag) & SD_BACKFACING && ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 				ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
 
 				if(sc) {
@@ -384,11 +395,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					 * spawned by transmission from the front */
 					sc->weight = make_float3(1.0f, 1.0f, 1.0f);
 					sc->N = N;
-					sd->flag |= bsdf_transparent_setup(sc);
+					sc->data0 = 0.0f;
+					sc->data1 = 0.0f;
+					sc->data2 = 0.0f;
+					ccl_fetch(sd, flag) |= bsdf_transparent_setup(sc);
 				}
 			}
 			else {
-				ShaderClosure *sc = &sd->closure[sd->num_closure];
+				ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
 				sc = svm_node_closure_get_bsdf(sd, mix_weight);
 
 				if(sc) {
@@ -397,18 +411,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->data1 = param2;
 					sc->data2 = -stack_load_float(stack, data_node.z);
 
-					if(!(sd->type & PRIMITIVE_ALL_CURVE)) {
-						sc->T = normalize(sd->dPdv);
+					if(!(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)) {
+						sc->T = normalize(ccl_fetch(sd, dPdv));
 						sc->data2 = 0.0f;
 					}
 					else
-						sc->T = sd->dPdu;
+						sc->T = normalize(ccl_fetch(sd, dPdu));
 
 					if(type == CLOSURE_BSDF_HAIR_REFLECTION_ID) {
-						sd->flag |= bsdf_hair_reflection_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_hair_reflection_setup(sc);
 					}
 					else {
-						sd->flag |= bsdf_hair_transmission_setup(sc);
+						ccl_fetch(sd, flag) |= bsdf_hair_transmission_setup(sc);
 					}
 				}
 			}
@@ -418,9 +432,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 #endif
 
 #ifdef __SUBSURFACE__
+#ifndef __SPLIT_KERNEL__
+#  define sc_next(sc) sc++
+#  else
+#  define sc_next(sc) sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure))
+#  endif
 		case CLOSURE_BSSRDF_CUBIC_ID:
 		case CLOSURE_BSSRDF_GAUSSIAN_ID: {
-			ShaderClosure *sc = &sd->closure[sd->num_closure];
+			ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
 			float3 weight = sc->weight * mix_weight;
 			float sample_weight = fabsf(average(weight));
 			
@@ -430,7 +449,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR)
 				param1 = 0.0f;
 
-			if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure+2 < MAX_CLOSURE) {
+			if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure)+2 < MAX_CLOSURE) {
 				/* radius * scale */
 				float3 radius = stack_load_float3(stack, data_node.z)*param1;
 				/* sharpness */
@@ -450,10 +469,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->prim = NULL;
 #endif
 					sc->N = N;
-					sd->flag |= bssrdf_setup(sc, (ClosureType)type);
+					ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type);
 
-					sd->num_closure++;
-					sc++;
+					ccl_fetch(sd, num_closure)++;
+					sc_next(sc);
 				}
 
 				if(fabsf(weight.y) > 0.0f) {
@@ -467,10 +486,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->prim = NULL;
 #endif
 					sc->N = N;
-					sd->flag |= bssrdf_setup(sc, (ClosureType)type);
+					ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type);
 
-					sd->num_closure++;
-					sc++;
+					ccl_fetch(sd, num_closure)++;
+					sc_next(sc);
 				}
 
 				if(fabsf(weight.z) > 0.0f) {
@@ -484,15 +503,16 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->prim = NULL;
 #endif
 					sc->N = N;
-					sd->flag |= bssrdf_setup(sc, (ClosureType)type);
+					ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type);
 
-					sd->num_closure++;
-					sc++;
+					ccl_fetch(sd, num_closure)++;
+					sc_next(sc);
 				}
 			}
 
 			break;
 		}
+#  undef sc_next
 #endif
 		default:
 			break;
@@ -520,7 +540,7 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float
 			ShaderClosure *sc = svm_node_closure_get_absorption(sd, mix_weight * density);
 
 			if(sc) {
-				sd->flag |= volume_absorption_setup(sc);
+				ccl_fetch(sd, flag) |= volume_absorption_setup(sc);
 			}
 			break;
 		}
@@ -528,9 +548,10 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float
 			ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight * density);
 
 			if(sc) {
-				float g = param2;
-				sc->data0 = g;
-				sd->flag |= volume_henyey_greenstein_setup(sc);
+				sc->data0 = param2; /* g */
+				sc->data1 = 0.0f;
+				sc->data2 = 0.0f;
+				ccl_fetch(sd, flag) |= volume_henyey_greenstein_setup(sc);
 			}
 			break;
 		}
@@ -555,7 +576,7 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no
 	else
 		svm_node_closure_get_non_bsdf(sd, CLOSURE_EMISSION_ID, 1.0f);
 
-	sd->flag |= SD_EMISSION;
+	ccl_fetch(sd, flag) |= SD_EMISSION;
 }
 
 ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
@@ -589,7 +610,7 @@ ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 nod
 	else
 		svm_node_closure_get_non_bsdf(sd, CLOSURE_HOLDOUT_ID, 1.0f);
 
-	sd->flag |= SD_HOLDOUT;
+	ccl_fetch(sd, flag) |= SD_HOLDOUT;
 }
 
 ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node)
@@ -607,15 +628,17 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack,
 	else
 		svm_node_closure_get_non_bsdf(sd, CLOSURE_AMBIENT_OCCLUSION_ID, 1.0f);
 
-	sd->flag |= SD_AO;
+	ccl_fetch(sd, flag) |= SD_AO;
 }
 
 /* Closure Nodes */
 
 ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight)
 {
-	if(sd->num_closure < MAX_CLOSURE)
-		sd->closure[sd->num_closure].weight = weight;
+	if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
+		ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
+		sc->weight = weight;
+	}
 }
 
 ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b)
@@ -650,7 +673,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
 	decode_node_uchar4(node.y, &weight_offset, &in_weight_offset, &weight1_offset, &weight2_offset);
 
 	float weight = stack_load_float(stack, weight_offset);
-	weight = clamp(weight, 0.0f, 1.0f);
+	weight = saturate(weight);
 
 	float in_weight = (stack_valid(in_weight_offset))? stack_load_float(stack, in_weight_offset): 1.0f;
 
@@ -665,7 +688,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
 ccl_device void svm_node_set_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
 {
 	float3 normal = stack_load_float3(stack, in_direction);
-	sd->N = normal;
+	ccl_fetch(sd, N) = normal;
 	stack_store_float3(stack, out_normal, normal);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_convert.h b/intern/cycles/kernel/svm/svm_convert.h
index b221e0728ec..34080377083 100644
--- a/intern/cycles/kernel/svm/svm_convert.h
+++ b/intern/cycles/kernel/svm/svm_convert.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index 6cd5ee4b375..8d4b07c9973 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -25,11 +25,11 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 	uint normal_offset, distance_offset, invert;
 	decode_node_uchar4(node.y, &normal_offset, &distance_offset, &invert, NULL);
 
-	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
+	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
 
 	/* get surface tangents from normal */
-	float3 Rx = cross(sd->dP.dy, normal_in);
-	float3 Ry = cross(normal_in, sd->dP.dx);
+	float3 Rx = cross(ccl_fetch(sd, dP).dy, normal_in);
+	float3 Ry = cross(normal_in, ccl_fetch(sd, dP).dx);
 
 	/* get bump values */
 	uint c_offset, x_offset, y_offset, strength_offset;
@@ -40,7 +40,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 	float h_y = stack_load_float(stack, y_offset);
 
 	/* compute surface gradient and determinant */
-	float det = dot(sd->dP.dx, Rx);
+	float det = dot(ccl_fetch(sd, dP).dx, Rx);
 	float3 surfgrad = (h_x - h_c)*Rx + (h_y - h_c)*Ry;
 
 	float absdet = fabsf(det);
@@ -65,7 +65,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 ccl_device void svm_node_set_displacement(ShaderData *sd, float *stack, uint fac_offset)
 {
 	float d = stack_load_float(stack, fac_offset);
-	sd->P += sd->N*d*0.1f; /* todo: get rid of this factor */
+	ccl_fetch(sd, P) += ccl_fetch(sd, N)*d*0.1f; /* todo: get rid of this factor */
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h
index 5def52205eb..23c97d80cb0 100644
--- a/intern/cycles/kernel/svm/svm_fresnel.h
+++ b/intern/cycles/kernel/svm/svm_fresnel.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -23,12 +23,12 @@ ccl_device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset,
 	uint normal_offset, out_offset;
 	decode_node_uchar4(node, &normal_offset, &out_offset, NULL, NULL);
 	float eta = (stack_valid(ior_offset))? stack_load_float(stack, ior_offset): __uint_as_float(ior_value);
-	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
+	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
 	
 	eta = fmaxf(eta, 1e-5f);
-	eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
+	eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
 
-	float f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
+	float f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);
 
 	stack_store_float(stack, out_offset, f);
 }
@@ -44,18 +44,18 @@ ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
 	decode_node_uchar4(node.w, &type, &normal_offset, &out_offset, NULL);
 
 	float blend = (stack_valid(blend_offset))? stack_load_float(stack, blend_offset): __uint_as_float(blend_value);
-	float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): sd->N;
+	float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
 
 	float f;
 
 	if(type == NODE_LAYER_WEIGHT_FRESNEL) {
 		float eta = fmaxf(1.0f - blend, 1e-5f);
-		eta = (sd->flag & SD_BACKFACING)? eta: 1.0f/eta;
+		eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? eta: 1.0f/eta;
 
-		f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
+		f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);
 	}
 	else {
-		f = fabsf(dot(sd->I, normal_in));
+		f = fabsf(dot(ccl_fetch(sd, I), normal_in));
 
 		if(blend != 0.5f) {
 			blend = clamp(blend, 0.0f, 1.0f-1e-5f);
diff --git a/intern/cycles/kernel/svm/svm_gamma.h b/intern/cycles/kernel/svm/svm_gamma.h
index c4749e7b936..b645ff3f0f9 100644
--- a/intern/cycles/kernel/svm/svm_gamma.h
+++ b/intern/cycles/kernel/svm/svm_gamma.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -21,14 +21,14 @@ ccl_device void svm_node_gamma(ShaderData *sd, float *stack, uint in_gamma, uint
 	float3 color = stack_load_float3(stack, in_color);
 	float gamma = stack_load_float(stack, in_gamma);
 
-	if (color.x > 0.0f)
+	if(color.x > 0.0f)
 		color.x = powf(color.x, gamma);
-	if (color.y > 0.0f)
+	if(color.y > 0.0f)
 		color.y = powf(color.y, gamma);
-	if (color.z > 0.0f)
+	if(color.z > 0.0f)
 		color.z = powf(color.z, gamma);
 
-	if (stack_valid(out_color))
+	if(stack_valid(out_color))
 		stack_store_float3(stack, out_color, color);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index fe681ec92af..bb06254c3a9 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -23,15 +23,15 @@ ccl_device void svm_node_geometry(KernelGlobals *kg, ShaderData *sd, float *stac
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = sd->P; break;
-		case NODE_GEOM_N: data = sd->N; break;
+		case NODE_GEOM_P: data = ccl_fetch(sd, P); break;
+		case NODE_GEOM_N: data = ccl_fetch(sd, N); break;
 #ifdef __DPDU__
 		case NODE_GEOM_T: data = primitive_tangent(kg, sd); break;
 #endif
-		case NODE_GEOM_I: data = sd->I; break;
-		case NODE_GEOM_Ng: data = sd->Ng; break;
+		case NODE_GEOM_I: data = ccl_fetch(sd, I); break;
+		case NODE_GEOM_Ng: data = ccl_fetch(sd, Ng); break;
 #ifdef __UV__
-		case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
+		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u), ccl_fetch(sd, v), 0.0f); break;
 #endif
 	}
 
@@ -44,8 +44,8 @@ ccl_device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, flo
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = sd->P + sd->dP.dx; break;
-		case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dx, sd->v + sd->dv.dx, 0.0f); break;
+		case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; break;
+		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dx, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dx, 0.0f); break;
 		default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
 	}
 
@@ -61,8 +61,8 @@ ccl_device void svm_node_geometry_bump_dy(KernelGlobals *kg, ShaderData *sd, flo
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = sd->P + sd->dP.dy; break;
-		case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dy, sd->v + sd->dv.dy, 0.0f); break;
+		case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; break;
+		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dy, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dy, 0.0f); break;
 		default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
 	}
 
@@ -83,9 +83,9 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s
 			stack_store_float3(stack, out_offset, object_location(kg, sd));
 			return;
 		}
-		case NODE_INFO_OB_INDEX: data = object_pass_id(kg, sd->object); break;
+		case NODE_INFO_OB_INDEX: data = object_pass_id(kg, ccl_fetch(sd, object)); break;
 		case NODE_INFO_MAT_INDEX: data = shader_pass_id(kg, sd); break;
-		case NODE_INFO_OB_RANDOM: data = object_random_number(kg, sd->object); break;
+		case NODE_INFO_OB_RANDOM: data = object_random_number(kg, ccl_fetch(sd, object)); break;
 		default: data = 0.0f; break;
 	}
 
@@ -98,44 +98,44 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg, ShaderData *sd, float
 {
 	switch(type) {
 		case NODE_INFO_PAR_INDEX: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float(stack, out_offset, particle_index(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_AGE: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float(stack, out_offset, particle_age(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_LIFETIME: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float(stack, out_offset, particle_lifetime(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_LOCATION: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float3(stack, out_offset, particle_location(kg, particle_id));
 			break;
 		}
 #if 0	/* XXX float4 currently not supported in SVM stack */
 		case NODE_INFO_PAR_ROTATION: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float4(stack, out_offset, particle_rotation(kg, particle_id));
 			break;
 		}
 #endif
 		case NODE_INFO_PAR_SIZE: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float(stack, out_offset, particle_size(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_VELOCITY: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float3(stack, out_offset, particle_velocity(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_ANGULAR_VELOCITY: {
-			int particle_id = object_particle_id(kg, sd->object);
+			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
 			stack_store_float3(stack, out_offset, particle_angular_velocity(kg, particle_id));
 			break;
 		}
@@ -153,7 +153,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *sta
 
 	switch(type) {
 		case NODE_INFO_CURVE_IS_STRAND: {
-			data = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
+			data = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) != 0;
 			stack_store_float(stack, out_offset, data);
 			break;
 		}
@@ -165,7 +165,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *sta
 			break;
 		}
 		/*case NODE_INFO_CURVE_FADE: {
-			data = sd->curve_transparency;
+			data = ccl_fetch(sd, curve_transparency);
 			stack_store_float(stack, out_offset, data);
 			break;
 		}*/
diff --git a/intern/cycles/kernel/svm/svm_gradient.h b/intern/cycles/kernel/svm/svm_gradient.h
index a4b3c0583f7..53d7b4f812c 100644
--- a/intern/cycles/kernel/svm/svm_gradient.h
+++ b/intern/cycles/kernel/svm/svm_gradient.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -66,7 +66,7 @@ ccl_device void svm_node_tex_gradient(ShaderData *sd, float *stack, uint4 node)
 	float3 co = stack_load_float3(stack, co_offset);
 
 	float f = svm_gradient(co, (NodeGradientType)type);
-	f = clamp(f, 0.0f, 1.0f);
+	f = saturate(f);
 
 	if(stack_valid(fac_offset))
 		stack_store_float(stack, fac_offset, f);
diff --git a/intern/cycles/kernel/svm/svm_hsv.h b/intern/cycles/kernel/svm/svm_hsv.h
index 11dfc4f096b..1f2cad60df7 100644
--- a/intern/cycles/kernel/svm/svm_hsv.h
+++ b/intern/cycles/kernel/svm/svm_hsv.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __SVM_HSV_H__
@@ -46,7 +46,12 @@ ccl_device void svm_node_hsv(KernelGlobals *kg, ShaderData *sd, float *stack, ui
 	color.y = fac*color.y + (1.0f - fac)*in_color.y;
 	color.z = fac*color.z + (1.0f - fac)*in_color.z;
 
-	if (stack_valid(out_color_offset))
+	/* Clamp color to prevent negative values caused by oversaturation. */
+	color.x = max(color.x, 0.0f);
+	color.y = max(color.y, 0.0f);
+	color.z = max(color.z, 0.0f);
+
+	if(stack_valid(out_color_offset))
 		stack_store_float3(stack, out_color_offset, color);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index a7abeda18e5..caf0b37ba35 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -65,7 +65,7 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 
 	float4 r;
 	int ix, iy, nix, niy;
-	if (interpolation == INTERPOLATION_CLOSEST) {
+	if(interpolation == INTERPOLATION_CLOSEST) {
 		svm_image_texture_frac(x*width, &ix);
 		svm_image_texture_frac(y*height, &iy);
 
@@ -251,9 +251,9 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 		case 95: r = kernel_tex_image_interp(__tex_image_095, x, y); break;
 		case 96: r = kernel_tex_image_interp(__tex_image_096, x, y); break;
 		case 97: r = kernel_tex_image_interp(__tex_image_097, x, y); break;
-		case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break;
 
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
+		case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break;
 		case 99: r = kernel_tex_image_interp(__tex_image_099, x, y); break;
 		case 100: r = kernel_tex_image_interp(__tex_image_100, x, y); break;
 		case 101: r = kernel_tex_image_interp(__tex_image_101, x, y); break;
@@ -354,6 +354,12 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 
 #endif
 
+/* Remap coordinate from 0..1 box to -1..1 */
+ccl_device_inline float3 texco_remap_square(float3 co)
+{
+	return (co - make_float3(0.5f, 0.5f, 0.5f)) * 2.0f;
+}
+
 ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	uint id = node.y;
@@ -362,8 +368,20 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta
 	decode_node_uchar4(node.z, &co_offset, &out_offset, &alpha_offset, &srgb);
 
 	float3 co = stack_load_float3(stack, co_offset);
+	float2 tex_co;
 	uint use_alpha = stack_valid(alpha_offset);
-	float4 f = svm_image_texture(kg, id, co.x, co.y, srgb, use_alpha);
+	if(node.w == NODE_IMAGE_PROJ_SPHERE) {
+		co = texco_remap_square(co);
+		tex_co = map_to_sphere(co);
+	}
+	else if(node.w == NODE_IMAGE_PROJ_TUBE) {
+		co = texco_remap_square(co);
+		tex_co = map_to_tube(co);
+	}
+	else {
+		tex_co = make_float2(co.x, co.y);
+	}
+	float4 f = svm_image_texture(kg, id, tex_co.x, tex_co.y, srgb, use_alpha);
 
 	if(stack_valid(out_offset))
 		stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));
@@ -374,10 +392,10 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta
 ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	/* get object space normal */
-	float3 N = sd->N;
+	float3 N = ccl_fetch(sd, N);
 
-	N = sd->N;
-	if(sd->object != OBJECT_NONE)
+	N = ccl_fetch(sd, N);
+	if(ccl_fetch(sd, object) != OBJECT_NONE)
 		object_inverse_normal_transform(kg, sd, &N);
 
 	/* project from direction vector to barycentric coordinates in triangles */
@@ -415,17 +433,17 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
 		/* in case of blending, test for mixes between two textures */
 		if(N.z < (1.0f - limit)*(N.y + N.x)) {
 			weight.x = N.x/(N.x + N.y);
-			weight.x = clamp((weight.x - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f);
+			weight.x = saturate((weight.x - 0.5f*(1.0f - blend))/blend);
 			weight.y = 1.0f - weight.x;
 		}
 		else if(N.x < (1.0f - limit)*(N.y + N.z)) {
 			weight.y = N.y/(N.y + N.z);
-			weight.y = clamp((weight.y - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f);
+			weight.y = saturate((weight.y - 0.5f*(1.0f - blend))/blend);
 			weight.z = 1.0f - weight.y;
 		}
 		else if(N.y < (1.0f - limit)*(N.x + N.z)) {
 			weight.x = N.x/(N.x + N.z);
-			weight.x = clamp((weight.x - 0.5f*(1.0f - blend))/blend, 0.0f, 1.0f);
+			weight.x = saturate((weight.x - 0.5f*(1.0f - blend))/blend);
 			weight.z = 1.0f - weight.x;
 		}
 		else {
@@ -435,6 +453,10 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
 			weight.z = ((2.0f - limit)*N.z + (limit - 1.0f))/(2.0f*limit - 1.0f);
 		}
 	}
+	else {
+		/* Desperate mode, no valid choice anyway, fall back to one side. */
+		weight.x = 1.0f;
+	}
 
 	/* now fetch textures */
 	uint co_offset, out_offset, alpha_offset, srgb;
@@ -459,7 +481,6 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
 		stack_store_float(stack, alpha_offset, f.w);
 }
 
-
 ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	uint id = node.y;
diff --git a/intern/cycles/kernel/svm/svm_invert.h b/intern/cycles/kernel/svm/svm_invert.h
index eb47e9ad4ab..5ce858e2e5d 100644
--- a/intern/cycles/kernel/svm/svm_invert.h
+++ b/intern/cycles/kernel/svm/svm_invert.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -30,7 +30,7 @@ ccl_device void svm_node_invert(ShaderData *sd, float *stack, uint in_fac, uint
 	color.y = invert(color.y, factor);
 	color.z = invert(color.z, factor);
 
-	if (stack_valid(out_color))
+	if(stack_valid(out_color))
 		stack_store_float3(stack, out_color, color);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h
index da544c63ae0..a235dd35224 100644
--- a/intern/cycles/kernel/svm/svm_light_path.h
+++ b/intern/cycles/kernel/svm/svm_light_path.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -31,10 +31,10 @@ ccl_device void svm_node_light_path(ShaderData *sd, float *stack, uint type, uin
 		case NODE_LP_reflection: info = (path_flag & PATH_RAY_REFLECT)? 1.0f: 0.0f; break;
 		case NODE_LP_transmission: info = (path_flag & PATH_RAY_TRANSMIT)? 1.0f: 0.0f; break;
 		case NODE_LP_volume_scatter: info = (path_flag & PATH_RAY_VOLUME_SCATTER)? 1.0f: 0.0f; break;
-		case NODE_LP_backfacing: info = (sd->flag & SD_BACKFACING)? 1.0f: 0.0f; break;
-		case NODE_LP_ray_length: info = sd->ray_length; break;
-		case NODE_LP_ray_depth: info = (float)sd->ray_depth; break;
-		case NODE_LP_ray_transparent: info = sd->transparent_depth; break;
+		case NODE_LP_backfacing: info = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f: 0.0f; break;
+		case NODE_LP_ray_length: info = ccl_fetch(sd, ray_length); break;
+		case NODE_LP_ray_depth: info = (float)ccl_fetch(sd, ray_depth); break;
+		case NODE_LP_ray_transparent: info = (float)ccl_fetch(sd, transparent_depth); break;
 	}
 
 	stack_store_float(stack, out_offset, info);
@@ -53,14 +53,14 @@ ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
 
 	switch(type) {
 		case NODE_LIGHT_FALLOFF_QUADRATIC: break;
-		case NODE_LIGHT_FALLOFF_LINEAR: strength *= sd->ray_length; break;
-		case NODE_LIGHT_FALLOFF_CONSTANT: strength *= sd->ray_length*sd->ray_length; break;
+		case NODE_LIGHT_FALLOFF_LINEAR: strength *= ccl_fetch(sd, ray_length); break;
+		case NODE_LIGHT_FALLOFF_CONSTANT: strength *= ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); break;
 	}
 
 	float smooth = stack_load_float(stack, smooth_offset);
 
 	if(smooth > 0.0f) {
-		float squared = sd->ray_length*sd->ray_length;
+		float squared = ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length);
 		strength *= squared/(smooth + squared);
 	}
 
diff --git a/intern/cycles/kernel/svm/svm_magic.h b/intern/cycles/kernel/svm/svm_magic.h
index b661f5cacf8..ac87c77d719 100644
--- a/intern/cycles/kernel/svm/svm_magic.h
+++ b/intern/cycles/kernel/svm/svm_magic.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/svm/svm_mapping.h b/intern/cycles/kernel/svm/svm_mapping.h
index c9fa8502dd1..0a890545af4 100644
--- a/intern/cycles/kernel/svm/svm_mapping.h
+++ b/intern/cycles/kernel/svm/svm_mapping.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/svm/svm_math.h b/intern/cycles/kernel/svm/svm_math.h
index 1ce9386e40e..d633e54ed8d 100644
--- a/intern/cycles/kernel/svm/svm_math.h
+++ b/intern/cycles/kernel/svm/svm_math.h
@@ -11,99 +11,11 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device float svm_math(NodeMath type, float Fac1, float Fac2)
-{
-	float Fac;
-
-	if(type == NODE_MATH_ADD)
-		Fac = Fac1 + Fac2;
-	else if(type == NODE_MATH_SUBTRACT)
-		Fac = Fac1 - Fac2;
-	else if(type == NODE_MATH_MULTIPLY)
-		Fac = Fac1*Fac2;
-	else if(type == NODE_MATH_DIVIDE)
-		Fac = safe_divide(Fac1, Fac2);
-	else if(type == NODE_MATH_SINE)
-		Fac = sinf(Fac1);
-	else if(type == NODE_MATH_COSINE)
-		Fac = cosf(Fac1);
-	else if(type == NODE_MATH_TANGENT)
-		Fac = tanf(Fac1);
-	else if(type == NODE_MATH_ARCSINE)
-		Fac = safe_asinf(Fac1);
-	else if(type == NODE_MATH_ARCCOSINE)
-		Fac = safe_acosf(Fac1);
-	else if(type == NODE_MATH_ARCTANGENT)
-		Fac = atanf(Fac1);
-	else if(type == NODE_MATH_POWER)
-		Fac = safe_powf(Fac1, Fac2);
-	else if(type == NODE_MATH_LOGARITHM)
-		Fac = safe_logf(Fac1, Fac2);
-	else if(type == NODE_MATH_MINIMUM)
-		Fac = fminf(Fac1, Fac2);
-	else if(type == NODE_MATH_MAXIMUM)
-		Fac = fmaxf(Fac1, Fac2);
-	else if(type == NODE_MATH_ROUND)
-		Fac = floorf(Fac1 + 0.5f);
-	else if(type == NODE_MATH_LESS_THAN)
-		Fac = Fac1 < Fac2;
-	else if(type == NODE_MATH_GREATER_THAN)
-		Fac = Fac1 > Fac2;
-	else if(type == NODE_MATH_MODULO)
-		Fac = safe_modulo(Fac1, Fac2);
-    else if(type == NODE_MATH_ABSOLUTE)
-        Fac = fabsf(Fac1);
-	else if(type == NODE_MATH_CLAMP)
-		Fac = clamp(Fac1, 0.0f, 1.0f);
-	else
-		Fac = 0.0f;
-	
-	return Fac;
-}
-
-ccl_device float average_fac(float3 v)
-{
-	return (fabsf(v.x) + fabsf(v.y) + fabsf(v.z))/3.0f;
-}
-
-ccl_device void svm_vector_math(float *Fac, float3 *Vector, NodeVectorMath type, float3 Vector1, float3 Vector2)
-{
-	if(type == NODE_VECTOR_MATH_ADD) {
-		*Vector = Vector1 + Vector2;
-		*Fac = average_fac(*Vector);
-	}
-	else if(type == NODE_VECTOR_MATH_SUBTRACT) {
-		*Vector = Vector1 - Vector2;
-		*Fac = average_fac(*Vector);
-	}
-	else if(type == NODE_VECTOR_MATH_AVERAGE) {
-		*Fac = len(Vector1 + Vector2);
-		*Vector = normalize(Vector1 + Vector2);
-	}
-	else if(type == NODE_VECTOR_MATH_DOT_PRODUCT) {
-		*Fac = dot(Vector1, Vector2);
-		*Vector = make_float3(0.0f, 0.0f, 0.0f);
-	}
-	else if(type == NODE_VECTOR_MATH_CROSS_PRODUCT) {
-		float3 c = cross(Vector1, Vector2);
-		*Fac = len(c);
-		*Vector = normalize(c);
-	}
-	else if(type == NODE_VECTOR_MATH_NORMALIZE) {
-		*Fac = len(Vector1);
-		*Vector = normalize(Vector1);
-	}
-	else {
-		*Fac = 0.0f;
-		*Vector = make_float3(0.0f, 0.0f, 0.0f);
-	}
-}
-
 /* Nodes */
 
 ccl_device void svm_node_math(KernelGlobals *kg, ShaderData *sd, float *stack, uint itype, uint f1_offset, uint f2_offset, int *offset)
diff --git a/intern/cycles/kernel/svm/svm_math_util.h b/intern/cycles/kernel/svm/svm_math_util.h
new file mode 100644
index 00000000000..645cbd3fc73
--- /dev/null
+++ b/intern/cycles/kernel/svm/svm_math_util.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device float average_fac(float3 v) /* mean of the absolute values of v's three components */
+{
+	return (fabsf(v.x) + fabsf(v.y) + fabsf(v.z))/3.0f;
+}
+
+ccl_device void svm_vector_math(float *Fac, float3 *Vector, NodeVectorMath type, float3 Vector1, float3 Vector2) /* applies operation `type`; always writes both *Vector and *Fac */
+{
+	if(type == NODE_VECTOR_MATH_ADD) {
+		*Vector = Vector1 + Vector2;
+		*Fac = average_fac(*Vector); /* scalar output: mean absolute component of the sum */
+	}
+	else if(type == NODE_VECTOR_MATH_SUBTRACT) {
+		*Vector = Vector1 - Vector2;
+		*Fac = average_fac(*Vector); /* scalar output: mean absolute component of the difference */
+	}
+	else if(type == NODE_VECTOR_MATH_AVERAGE) {
+		*Fac = len(Vector1 + Vector2); /* length of the un-normalized sum */
+		*Vector = normalize(Vector1 + Vector2); /* NOTE(review): sum may have zero length - confirm how normalize() handles that */
+	}
+	else if(type == NODE_VECTOR_MATH_DOT_PRODUCT) {
+		*Fac = dot(Vector1, Vector2);
+		*Vector = make_float3(0.0f, 0.0f, 0.0f); /* dot product has no vector result; output zero */
+	}
+	else if(type == NODE_VECTOR_MATH_CROSS_PRODUCT) {
+		float3 c = cross(Vector1, Vector2);
+		*Fac = len(c); /* length of the cross product before normalization */
+		*Vector = normalize(c);
+	}
+	else if(type == NODE_VECTOR_MATH_NORMALIZE) {
+		*Fac = len(Vector1); /* Vector2 is not used by this operation */
+		*Vector = normalize(Vector1);
+	}
+	else { /* unrecognized type: zero both outputs */
+		*Fac = 0.0f;
+		*Vector = make_float3(0.0f, 0.0f, 0.0f);
+	}
+}
+
+ccl_device float svm_math(NodeMath type, float Fac1, float Fac2) /* evaluates scalar operation `type`; single-input operations read only Fac1 */
+{
+	float Fac;
+
+	if(type == NODE_MATH_ADD)
+		Fac = Fac1 + Fac2;
+	else if(type == NODE_MATH_SUBTRACT)
+		Fac = Fac1 - Fac2;
+	else if(type == NODE_MATH_MULTIPLY)
+		Fac = Fac1*Fac2;
+	else if(type == NODE_MATH_DIVIDE)
+		Fac = safe_divide(Fac1, Fac2);
+	else if(type == NODE_MATH_SINE)
+		Fac = sinf(Fac1);
+	else if(type == NODE_MATH_COSINE)
+		Fac = cosf(Fac1);
+	else if(type == NODE_MATH_TANGENT)
+		Fac = tanf(Fac1);
+	else if(type == NODE_MATH_ARCSINE)
+		Fac = safe_asinf(Fac1);
+	else if(type == NODE_MATH_ARCCOSINE)
+		Fac = safe_acosf(Fac1);
+	else if(type == NODE_MATH_ARCTANGENT)
+		Fac = atanf(Fac1);
+	else if(type == NODE_MATH_POWER)
+		Fac = safe_powf(Fac1, Fac2);
+	else if(type == NODE_MATH_LOGARITHM)
+		Fac = safe_logf(Fac1, Fac2);
+	else if(type == NODE_MATH_MINIMUM)
+		Fac = fminf(Fac1, Fac2);
+	else if(type == NODE_MATH_MAXIMUM)
+		Fac = fmaxf(Fac1, Fac2);
+	else if(type == NODE_MATH_ROUND)
+		Fac = floorf(Fac1 + 0.5f); /* rounds to nearest, halves go up */
+	else if(type == NODE_MATH_LESS_THAN)
+		Fac = Fac1 < Fac2; /* comparison result converted to float: 1.0f when true, 0.0f otherwise */
+	else if(type == NODE_MATH_GREATER_THAN)
+		Fac = Fac1 > Fac2; /* comparison result converted to float: 1.0f when true, 0.0f otherwise */
+	else if(type == NODE_MATH_MODULO)
+		Fac = safe_modulo(Fac1, Fac2);
+	else if(type == NODE_MATH_ABSOLUTE)
+		Fac = fabsf(Fac1);
+	else if(type == NODE_MATH_CLAMP)
+		Fac = saturate(Fac1); /* replaces the former clamp(Fac1, 0.0f, 1.0f) */
+	else
+		Fac = 0.0f; /* unrecognized type yields 0 */
+	
+	return Fac;
+}
+
+ccl_device float3 svm_math_blackbody_color(float t) {
+	/* Calculate color for t in range 800..12000 using a piecewise approximation:
+	 * a/t + b*t + c for R and G, and ((a*t + b)*t + c)*t + d for B.
+	 * Max absolute error for RGB is (0.00095, 0.00077, 0.00057),
+	 * which is enough to get the same 8 bit/channel color.
+	 */
+
+	const float rc[6][3] = { /* R coefficients {a, b, c}, one row per range of t */
+		{  2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f },
+		{  3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f },
+		{  4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f },
+		{  4.66849800e+03f,  2.85655028e-05f, 1.29075375e-01f },
+		{  4.60124770e+03f,  2.89727618e-05f, 1.48001316e-01f },
+		{  3.78765709e+03f,  9.36026367e-06f, 3.98995841e-01f },
+	};
+
+	const float gc[6][3] = { /* G coefficients {a, b, c}, one row per range of t */
+		{ -7.50343014e+02f,  3.15679613e-04f, 4.73464526e-01f },
+		{ -1.00402363e+03f,  1.29189794e-04f, 9.08181524e-01f },
+		{ -1.22075471e+03f,  2.56245413e-05f, 1.20753416e+00f },
+		{ -1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f },
+		{ -1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f },
+		{ -5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f },
+	};
+
+	const float bc[6][4] = { /* B coefficients {a, b, c, d}, one row per range of t */
+		{ 0.0f, 0.0f, 0.0f, 0.0f }, /* zeros should be optimized by compiler */
+		{ 0.0f, 0.0f, 0.0f, 0.0f },
+		{ 0.0f, 0.0f, 0.0f, 0.0f }, /* rows 0..2 are all zero, so B is 0 for 965 <= t < 1902 */
+		{ -2.02524603e-11f,  1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f },
+		{ -2.22463426e-13f, -1.55078698e-08f,  3.81675160e-04f, -7.30646033e-01f },
+		{  6.72595954e-13f, -2.73059993e-08f,  4.24068546e-04f, -7.52204323e-01f },
+	};
+
+	if(t >= 12000.0f) /* at and above the upper bound a fixed color is returned */
+		return make_float3(0.826270103f, 0.994478524f, 1.56626022f);
+
+	/* Define a macro to reduce stack usage for nvcc */
+#define MAKE_BB_RGB(i) make_float3(\
+		rc[i][0] / t + rc[i][1] * t + rc[i][2],\
+		gc[i][0] / t + gc[i][1] * t + gc[i][2],\
+		((bc[i][0] * t + bc[i][1]) * t + bc[i][2]) * t + bc[i][3])
+
+	if(t >= 6365.0f) /* ranges are tested from highest to lowest; the first match selects the coefficient row */
+		return MAKE_BB_RGB(5);
+	if(t >= 3315.0f)
+		return MAKE_BB_RGB(4);
+	if(t >= 1902.0f)
+		return MAKE_BB_RGB(3);
+	if(t >= 1449.0f)
+		return MAKE_BB_RGB(2);
+	if(t >= 1167.0f)
+		return MAKE_BB_RGB(1);
+	if(t >= 965.0f)
+		return MAKE_BB_RGB(0);
+
+#undef MAKE_BB_RGB
+
+	/* For 800 <= t < 965 color does not change in OSL implementation, so keep color the same; every t below 965 (including t < 800) gets this value. */
+	return make_float3(4.70366907f, 0.0f, 0.0f);
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/svm/svm_mix.h b/intern/cycles/kernel/svm/svm_mix.h
index edc3903865e..6111214acba 100644
--- a/intern/cycles/kernel/svm/svm_mix.h
+++ b/intern/cycles/kernel/svm/svm_mix.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -254,16 +254,16 @@ ccl_device float3 svm_mix_clamp(float3 col)
 {
 	float3 outcol = col;
 
-	outcol.x = clamp(col.x, 0.0f, 1.0f);
-	outcol.y = clamp(col.y, 0.0f, 1.0f);
-	outcol.z = clamp(col.z, 0.0f, 1.0f);
+	outcol.x = saturate(col.x);
+	outcol.y = saturate(col.y);
+	outcol.z = saturate(col.z);
 
 	return outcol;
 }
 
 ccl_device float3 svm_mix(NodeMix type, float fac, float3 c1, float3 c2)
 {
-	float t = clamp(fac, 0.0f, 1.0f);
+	float t = saturate(fac);
 
 	switch(type) {
 		case NODE_MIX_BLEND: return svm_mix_blend(t, c1, c2);
diff --git a/intern/cycles/kernel/svm/svm_musgrave.h b/intern/cycles/kernel/svm/svm_musgrave.h
index 61171d6849c..09eba31945e 100644
--- a/intern/cycles/kernel/svm/svm_musgrave.h
+++ b/intern/cycles/kernel/svm/svm_musgrave.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -25,7 +25,7 @@ CCL_NAMESPACE_BEGIN
  * from "Texturing and Modelling: A procedural approach"
  */
 
-ccl_device_noinline float noise_musgrave_fBm(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves)
+ccl_device_noinline float noise_musgrave_fBm(float3 p, float H, float lacunarity, float octaves)
 {
 	float rmd;
 	float value = 0.0f;
@@ -53,7 +53,7 @@ ccl_device_noinline float noise_musgrave_fBm(float3 p, NodeNoiseBasis basis, flo
  * octaves: number of frequencies in the fBm
  */
 
-ccl_device_noinline float noise_musgrave_multi_fractal(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves)
+ccl_device_noinline float noise_musgrave_multi_fractal(float3 p, float H, float lacunarity, float octaves)
 {
 	float rmd;
 	float value = 1.0f;
@@ -82,7 +82,7 @@ ccl_device_noinline float noise_musgrave_multi_fractal(float3 p, NodeNoiseBasis
  * offset: raises the terrain from `sea level'
  */
 
-ccl_device_noinline float noise_musgrave_hetero_terrain(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves, float offset)
+ccl_device_noinline float noise_musgrave_hetero_terrain(float3 p, float H, float lacunarity, float octaves, float offset)
 {
 	float value, increment, rmd;
 	float pwHL = powf(lacunarity, -H);
@@ -117,7 +117,7 @@ ccl_device_noinline float noise_musgrave_hetero_terrain(float3 p, NodeNoiseBasis
  * offset: raises the terrain from `sea level'
  */
 
-ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves, float offset, float gain)
+ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(float3 p, float H, float lacunarity, float octaves, float offset, float gain)
 {
 	float result, signal, weight, rmd;
 	float pwHL = powf(lacunarity, -H);
@@ -154,7 +154,7 @@ ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(float3 p, NodeNois
  * offset: raises the terrain from `sea level'
  */
 
-ccl_device_noinline float noise_musgrave_ridged_multi_fractal(float3 p, NodeNoiseBasis basis, float H, float lacunarity, float octaves, float offset, float gain)
+ccl_device_noinline float noise_musgrave_ridged_multi_fractal(float3 p, float H, float lacunarity, float octaves, float offset, float gain)
 {
 	float result, signal, weight;
 	float pwHL = powf(lacunarity, -H);
@@ -168,7 +168,7 @@ ccl_device_noinline float noise_musgrave_ridged_multi_fractal(float3 p, NodeNois
 
 	for(i = 1; i < float_to_int(octaves); i++) {
 		p *= lacunarity;
-		weight = clamp(signal * gain, 0.0f, 1.0f);
+		weight = saturate(signal * gain);
 		signal = offset - fabsf(snoise(p));
 		signal *= signal;
 		signal *= weight;
@@ -183,18 +183,16 @@ ccl_device_noinline float noise_musgrave_ridged_multi_fractal(float3 p, NodeNois
 
 ccl_device float svm_musgrave(NodeMusgraveType type, float dimension, float lacunarity, float octaves, float offset, float intensity, float gain, float3 p)
 {
-	NodeNoiseBasis basis = NODE_NOISE_PERLIN;
-
 	if(type == NODE_MUSGRAVE_MULTIFRACTAL)
-		return intensity*noise_musgrave_multi_fractal(p, basis, dimension, lacunarity, octaves);
+		return intensity*noise_musgrave_multi_fractal(p, dimension, lacunarity, octaves);
 	else if(type == NODE_MUSGRAVE_FBM)
-		return intensity*noise_musgrave_fBm(p, basis, dimension, lacunarity, octaves);
+		return intensity*noise_musgrave_fBm(p, dimension, lacunarity, octaves);
 	else if(type == NODE_MUSGRAVE_HYBRID_MULTIFRACTAL)
-		return intensity*noise_musgrave_hybrid_multi_fractal(p, basis, dimension, lacunarity, octaves, offset, gain);
+		return intensity*noise_musgrave_hybrid_multi_fractal(p, dimension, lacunarity, octaves, offset, gain);
 	else if(type == NODE_MUSGRAVE_RIDGED_MULTIFRACTAL)
-		return intensity*noise_musgrave_ridged_multi_fractal(p, basis, dimension, lacunarity, octaves, offset, gain);
+		return intensity*noise_musgrave_ridged_multi_fractal(p, dimension, lacunarity, octaves, offset, gain);
 	else if(type == NODE_MUSGRAVE_HETERO_TERRAIN)
-		return intensity*noise_musgrave_hetero_terrain(p, basis, dimension, lacunarity, octaves, offset);
+		return intensity*noise_musgrave_hetero_terrain(p, dimension, lacunarity, octaves, offset);
 	
 	return 0.0f;
 }
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index 869341c81f4..c77c2a1c482 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -290,40 +290,6 @@ ccl_device_noinline float perlin(float x, float y, float z)
 }
 #endif
 
-#if 0 // unused
-ccl_device_noinline float perlin_periodic(float x, float y, float z, float3 pperiod)
-{
-	int X; float fx = floorfrac(x, &X);
-	int Y; float fy = floorfrac(y, &Y);
-	int Z; float fz = floorfrac(z, &Z);
-
-	int3 p;
-
-	p.x = max(quick_floor(pperiod.x), 1);
-	p.y = max(quick_floor(pperiod.y), 1);
-	p.z = max(quick_floor(pperiod.z), 1);
-
-	float u = fade(fx);
-	float v = fade(fy);
-	float w = fade(fz);
-
-	float result;
-
-	result = nerp (w, nerp (v, nerp (u, grad (phash (X  , Y  , Z  , p), fx	 , fy	 , fz	  ),
-										grad (phash (X+1, Y  , Z  , p), fx-1.0f, fy	 , fz	  )),
-							   nerp (u, grad (phash (X  , Y+1, Z  , p), fx	 , fy-1.0f, fz	  ),
-										grad (phash (X+1, Y+1, Z  , p), fx-1.0f, fy-1.0f, fz	  ))),
-					  nerp (v, nerp (u, grad (phash (X  , Y  , Z+1, p), fx	 , fy	 , fz-1.0f ),
-										grad (phash (X+1, Y  , Z+1, p), fx-1.0f, fy	 , fz-1.0f )),
-							   nerp (u, grad (phash (X  , Y+1, Z+1, p), fx	 , fy-1.0f, fz-1.0f ),
-										grad (phash (X+1, Y+1, Z+1, p), fx-1.0f, fy-1.0f, fz-1.0f ))));
-	float r = scale3(result);
-
-	/* can happen for big coordinates, things even out to 0.0 then anyway */
-	return (isfinite(r))? r: 0.0f;
-}
-#endif
-
 /* perlin noise in range 0..1 */
 ccl_device float noise(float3 p)
 {
@@ -367,20 +333,5 @@ ccl_device ssef cellnoise_color(const ssef& p)
 }
 #endif
 
-#if 0 // unused
-/* periodic perlin noise in range 0..1 */
-ccl_device float pnoise(float3 p, float3 pperiod)
-{
-	float r = perlin_periodic(p.x, p.y, p.z, pperiod);
-	return 0.5f*r + 0.5f;
-}
-
-/* periodic perlin noise in range -1..1 */
-ccl_device float psnoise(float3 p, float3 pperiod)
-{
-	return perlin_periodic(p.x, p.y, p.z, pperiod);
-}
-#endif
-
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/svm/svm_noisetex.h b/intern/cycles/kernel/svm/svm_noisetex.h
index 5d5cfe6ffcc..62ff38cf1c5 100644
--- a/intern/cycles/kernel/svm/svm_noisetex.h
+++ b/intern/cycles/kernel/svm/svm_noisetex.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -20,23 +20,22 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline void svm_noise(float3 p, float detail, float distortion, float *fac, float3 *color)
 {
-	NodeNoiseBasis basis = NODE_NOISE_PERLIN;
 	int hard = 0;
 
 	if(distortion != 0.0f) {
 		float3 r, offset = make_float3(13.5f, 13.5f, 13.5f);
 
-		r.x = noise_basis(p + offset, basis) * distortion;
-		r.y = noise_basis(p, basis) * distortion;
-		r.z = noise_basis(p - offset, basis) * distortion;
+		r.x = noise(p + offset) * distortion;
+		r.y = noise(p) * distortion;
+		r.z = noise(p - offset) * distortion;
 
 		p += r;
 	}
 
-	*fac = noise_turbulence(p, basis, detail, hard);
+	*fac = noise_turbulence(p, detail, hard);
 	*color = make_float3(*fac,
-		noise_turbulence(make_float3(p.y, p.x, p.z), basis, detail, hard),
-		noise_turbulence(make_float3(p.y, p.z, p.x), basis, detail, hard));
+		noise_turbulence(make_float3(p.y, p.x, p.z), detail, hard),
+		noise_turbulence(make_float3(p.y, p.z, p.x), detail, hard));
 }
 
 ccl_device void svm_node_tex_noise(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
diff --git a/intern/cycles/kernel/svm/svm_normal.h b/intern/cycles/kernel/svm/svm_normal.h
index 8695031b8b9..53abef71012 100644
--- a/intern/cycles/kernel/svm/svm_normal.h
+++ b/intern/cycles/kernel/svm/svm_normal.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -28,10 +28,10 @@ ccl_device void svm_node_normal(KernelGlobals *kg, ShaderData *sd, float *stack,
 	direction.z = __int_as_float(node1.z);
 	direction = normalize(direction);
 
-	if (stack_valid(out_normal_offset))
+	if(stack_valid(out_normal_offset))
 		stack_store_float3(stack, out_normal_offset, direction);
 
-	if (stack_valid(out_dot_offset))
+	if(stack_valid(out_dot_offset))
 		stack_store_float(stack, out_dot_offset, dot(direction, normalize(normal)));
 }
 
diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h
index 55eee3d24c3..062ab013b1f 100644
--- a/intern/cycles/kernel/svm/svm_ramp.h
+++ b/intern/cycles/kernel/svm/svm_ramp.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __SVM_RAMP_H__
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device float4 rgb_ramp_lookup(KernelGlobals *kg, int offset, float f, bool interpolate)
 {
-	f = clamp(f, 0.0f, 1.0f)*(RAMP_TABLE_SIZE-1);
+	f = saturate(f)*(RAMP_TABLE_SIZE-1);
 
 	/* clamp int as well in case of NaN */
 	int i = clamp(float_to_int(f), 0, RAMP_TABLE_SIZE-1);
diff --git a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
index 111d5d47988..6f51b163756 100644
--- a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
+++ b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -28,7 +28,7 @@ ccl_device void svm_node_combine_hsv(KernelGlobals *kg, ShaderData *sd, float *s
 	/* Combine, and convert back to RGB */
 	float3 color = hsv_to_rgb(make_float3(hue, saturation, value));
 
-	if (stack_valid(color_out))
+	if(stack_valid(color_out))
 		stack_store_float3(stack, color_out, color);
 }
 
@@ -42,11 +42,11 @@ ccl_device void svm_node_separate_hsv(KernelGlobals *kg, ShaderData *sd, float *
 	/* Convert to HSV */
 	color = rgb_to_hsv(color);
 
-	if (stack_valid(hue_out))
+	if(stack_valid(hue_out))
 		stack_store_float(stack, hue_out, color.x);
-	if (stack_valid(saturation_out))
+	if(stack_valid(saturation_out))
 		stack_store_float(stack, saturation_out, color.y);
-	if (stack_valid(value_out))
+	if(stack_valid(value_out))
 		stack_store_float(stack, value_out, color.z);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_sepcomb_vector.h b/intern/cycles/kernel/svm/svm_sepcomb_vector.h
index c8e7e34f87d..63570dd6942 100644
--- a/intern/cycles/kernel/svm/svm_sepcomb_vector.h
+++ b/intern/cycles/kernel/svm/svm_sepcomb_vector.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -22,7 +22,7 @@ ccl_device void svm_node_combine_vector(ShaderData *sd, float *stack, uint in_of
 {
 	float vector = stack_load_float(stack, in_offset);
 
-	if (stack_valid(out_offset))
+	if(stack_valid(out_offset))
 		stack_store_float(stack, out_offset+vector_index, vector);
 }
 
@@ -30,10 +30,10 @@ ccl_device void svm_node_separate_vector(ShaderData *sd, float *stack, uint ivec
 {
 	float3 vector = stack_load_float3(stack, ivector_offset);
 
-	if (stack_valid(out_offset)) {
-		if (vector_index == 0)
+	if(stack_valid(out_offset)) {
+		if(vector_index == 0)
 			stack_store_float(stack, out_offset, vector.x);
-		else if (vector_index == 1)
+		else if(vector_index == 1)
 			stack_store_float(stack, out_offset, vector.y);
 		else
 			stack_store_float(stack, out_offset, vector.z);
diff --git a/intern/cycles/kernel/svm/svm_sky.h b/intern/cycles/kernel/svm/svm_sky.h
index 500b5146931..4c8e3a32271 100644
--- a/intern/cycles/kernel/svm/svm_sky.h
+++ b/intern/cycles/kernel/svm/svm_sky.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h
index a17e4a25efe..eebd9bee420 100644
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -11,67 +11,85 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
 
 /* Texture Coordinate Node */
 
-ccl_device void svm_node_tex_coord(KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint type, uint out_offset)
+ccl_device void svm_node_tex_coord(KernelGlobals *kg,
+                                   ShaderData *sd,
+                                   int path_flag,
+                                   float *stack,
+                                   uint4 node,
+                                   int *offset)
 {
 	float3 data;
+	uint type = node.y;
+	uint out_offset = node.z;
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = sd->P;
-			if(sd->object != OBJECT_NONE)
-				object_inverse_position_transform(kg, sd, &data);
+			data = ccl_fetch(sd, P);
+			if(node.w == 0) {
+				if(ccl_fetch(sd, object) != OBJECT_NONE) {
+					object_inverse_position_transform(kg, sd, &data);
+				}
+			}
+			else {
+				Transform tfm;
+				tfm.x = read_node_float(kg, offset);
+				tfm.y = read_node_float(kg, offset);
+				tfm.z = read_node_float(kg, offset);
+				tfm.w = read_node_float(kg, offset);
+				data = transform_point(&tfm, data);
+			}
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = sd->N;
-			if(sd->object != OBJECT_NONE)
+			data = ccl_fetch(sd, N);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(sd->object != OBJECT_NONE)
-				data = transform_point(&tfm, sd->P);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = transform_point(&tfm, ccl_fetch(sd, P));
 			else
-				data = transform_point(&tfm, sd->P + camera_position(kg));
+				data = transform_point(&tfm, ccl_fetch(sd, P) + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, sd->ray_P);
+			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P));
 			else
-				data = camera_world_to_ndc(kg, sd, sd->P);
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P));
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(sd->object != OBJECT_NONE)
-				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
 			else
-				data = sd->I;
+				data = ccl_fetch(sd, I);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, sd->object);
+			data = object_dupli_generated(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, sd->object);
+			data = object_dupli_uv(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = sd->P;
+			data = ccl_fetch(sd, P);
 
 #ifdef __VOLUME__
-			if(sd->object != OBJECT_NONE)
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -81,61 +99,79 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, ShaderData *sd, int path_f
 	stack_store_float3(stack, out_offset, data);
 }
 
-ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint type, uint out_offset)
+ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
+                                           ShaderData *sd,
+                                           int path_flag,
+                                           float *stack,
+                                           uint4 node,
+                                           int *offset)
 {
 #ifdef __RAY_DIFFERENTIALS__
 	float3 data;
+	uint type = node.y;
+	uint out_offset = node.z;
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = sd->P + sd->dP.dx;
-			if(sd->object != OBJECT_NONE)
-				object_inverse_position_transform(kg, sd, &data);
+			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;
+			if(node.w == 0) {
+				if(ccl_fetch(sd, object) != OBJECT_NONE) {
+					object_inverse_position_transform(kg, sd, &data);
+				}
+			}
+			else {
+				Transform tfm;
+				tfm.x = read_node_float(kg, offset);
+				tfm.y = read_node_float(kg, offset);
+				tfm.z = read_node_float(kg, offset);
+				tfm.w = read_node_float(kg, offset);
+				data = transform_point(&tfm, data);
+			}
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = sd->N;
-			if(sd->object != OBJECT_NONE)
+			data = ccl_fetch(sd, N);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(sd->object != OBJECT_NONE)
-				data = transform_point(&tfm, sd->P + sd->dP.dx);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
 			else
-				data = transform_point(&tfm, sd->P + sd->dP.dx + camera_position(kg));
+				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx);
+			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dx);
 			else
-				data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx);
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(sd->object != OBJECT_NONE)
-				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
 			else
-				data = sd->I;
+				data = ccl_fetch(sd, I);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, sd->object);
+			data = object_dupli_generated(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, sd->object);
+			data = object_dupli_uv(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = sd->P + sd->dP.dx;
+			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;
 
 #ifdef __VOLUME__
-			if(sd->object != OBJECT_NONE)
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -144,65 +180,83 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, ShaderData *sd, in
 
 	stack_store_float3(stack, out_offset, data);
 #else
-	svm_node_tex_coord(kg, sd, stack, type, out_offset);
+	svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
 #endif
 }
 
-ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint type, uint out_offset)
+ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
+                                           ShaderData *sd,
+                                           int path_flag,
+                                           float *stack,
+                                           uint4 node,
+                                           int *offset)
 {
 #ifdef __RAY_DIFFERENTIALS__
 	float3 data;
+	uint type = node.y;
+	uint out_offset = node.z;
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = sd->P + sd->dP.dy;
-			if(sd->object != OBJECT_NONE)
-				object_inverse_position_transform(kg, sd, &data);
+			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;
+			if(node.w == 0) {
+				if(ccl_fetch(sd, object) != OBJECT_NONE) {
+					object_inverse_position_transform(kg, sd, &data);
+				}
+			}
+			else {
+				Transform tfm;
+				tfm.x = read_node_float(kg, offset);
+				tfm.y = read_node_float(kg, offset);
+				tfm.z = read_node_float(kg, offset);
+				tfm.w = read_node_float(kg, offset);
+				data = transform_point(&tfm, data);
+			}
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = sd->N;
-			if(sd->object != OBJECT_NONE)
+			data = ccl_fetch(sd, N);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(sd->object != OBJECT_NONE)
-				data = transform_point(&tfm, sd->P + sd->dP.dy);
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
 			else
-				data = transform_point(&tfm, sd->P + sd->dP.dy + camera_position(kg));
+				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy);
+			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dy);
 			else
-				data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy);
+				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(sd->object != OBJECT_NONE)
-				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
+				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
 			else
-				data = sd->I;
+				data = ccl_fetch(sd, I);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, sd->object);
+			data = object_dupli_generated(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, sd->object);
+			data = object_dupli_uv(kg, ccl_fetch(sd, object));
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = sd->P + sd->dP.dy;
+			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;
 
 #ifdef __VOLUME__
-			if(sd->object != OBJECT_NONE)
+			if(ccl_fetch(sd, object) != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -211,7 +265,7 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, ShaderData *sd, in
 
 	stack_store_float3(stack, out_offset, data);
 #else
-	svm_node_tex_coord(kg, sd, stack, type, out_offset);
+	svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
 #endif
 }
 
@@ -227,7 +281,7 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 
 	if(space == NODE_NORMAL_MAP_TANGENT) {
 		/* tangent space */
-		if(sd->object == OBJECT_NONE) {
+		if(ccl_fetch(sd, object) == OBJECT_NONE) {
 			stack_store_float3(stack, normal_offset, make_float3(0.0f, 0.0f, 0.0f));
 			return;
 		}
@@ -248,11 +302,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 		float sign = primitive_attribute_float(kg, sd, attr_sign_elem, attr_sign_offset, NULL, NULL);
 		float3 normal;
 
-		if(sd->shader & SHADER_SMOOTH_NORMAL) {
+		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
 			normal = primitive_attribute_float3(kg, sd, attr_normal_elem, attr_normal_offset, NULL, NULL);
 		}
 		else {
-			normal = sd->Ng;
+			normal = ccl_fetch(sd, Ng);
 			object_inverse_normal_transform(kg, sd, &normal);
 		}
 
@@ -283,7 +337,7 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 
 	if(strength != 1.0f) {
 		strength = max(strength, 0.0f);
-		N = normalize(sd->N + (N - sd->N)*strength);
+		N = normalize(ccl_fetch(sd, N) + (N - ccl_fetch(sd, N))*strength);
 	}
 
 	stack_store_float3(stack, normal_offset, N);
@@ -313,7 +367,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
 		float3 generated;
 
 		if(attr_offset == ATTR_STD_NOT_FOUND)
-			generated = sd->P;
+			generated = ccl_fetch(sd, P);
 		else
 			generated = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL);
 
@@ -326,7 +380,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
 	}
 
 	object_normal_transform(kg, sd, &tangent);
-	tangent = cross(sd->N, normalize(cross(tangent, sd->N)));
+	tangent = cross(ccl_fetch(sd, N), normalize(cross(tangent, ccl_fetch(sd, N))));
 	stack_store_float3(stack, tangent_offset, tangent);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_texture.h b/intern/cycles/kernel/svm/svm_texture.h
index d97c85db36a..dcb00f7dd55 100644
--- a/intern/cycles/kernel/svm/svm_texture.h
+++ b/intern/cycles/kernel/svm/svm_texture.h
@@ -11,266 +11,14 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
 
-/* Voronoi Distances */
-
-#if 0
-ccl_device float voronoi_distance(NodeDistanceMetric distance_metric, float3 d, float e)
-{
-#if 0
-	if(distance_metric == NODE_VORONOI_DISTANCE_SQUARED)
-#endif
-		return dot(d, d);
-#if 0
-	if(distance_metric == NODE_VORONOI_ACTUAL_DISTANCE)
-		return len(d);
-	if(distance_metric == NODE_VORONOI_MANHATTAN)
-		return fabsf(d.x) + fabsf(d.y) + fabsf(d.z);
-	if(distance_metric == NODE_VORONOI_CHEBYCHEV)
-		return fmaxf(fabsf(d.x), fmaxf(fabsf(d.y), fabsf(d.z)));
-	if(distance_metric == NODE_VORONOI_MINKOVSKY_H)
-		return sqrtf(fabsf(d.x)) + sqrtf(fabsf(d.y)) + sqrtf(fabsf(d.y));
-	if(distance_metric == NODE_VORONOI_MINKOVSKY_4)
-		return sqrtf(sqrtf(dot(d*d, d*d)));
-	if(distance_metric == NODE_VORONOI_MINKOVSKY)
-		return powf(powf(fabsf(d.x), e) + powf(fabsf(d.y), e) + powf(fabsf(d.z), e), 1.0f/e);
-	
-	return 0.0f;
-#endif
-}
-
-/* Voronoi / Worley like */
-ccl_device_inline float4 voronoi_Fn(float3 p, float e, int n1, int n2)
-{
-	float da[4];
-	float3 pa[4];
-	NodeDistanceMetric distance_metric = NODE_VORONOI_DISTANCE_SQUARED;
-
-	/* returns distances in da and point coords in pa */
-	int xx, yy, zz, xi, yi, zi;
-
-	xi = floor_to_int(p.x);
-	yi = floor_to_int(p.y);
-	zi = floor_to_int(p.z);
-
-	da[0] = 1e10f;
-	da[1] = 1e10f;
-	da[2] = 1e10f;
-	da[3] = 1e10f;
-
-	pa[0] = make_float3(0.0f, 0.0f, 0.0f);
-	pa[1] = make_float3(0.0f, 0.0f, 0.0f);
-	pa[2] = make_float3(0.0f, 0.0f, 0.0f);
-	pa[3] = make_float3(0.0f, 0.0f, 0.0f);
-
-	for(xx = xi-1; xx <= xi+1; xx++) {
-		for(yy = yi-1; yy <= yi+1; yy++) {
-			for(zz = zi-1; zz <= zi+1; zz++) {
-				float3 ip = make_float3((float)xx, (float)yy, (float)zz);
-				float3 vp = cellnoise_color(ip);
-				float3 pd = p - (vp + ip);
-				float d = voronoi_distance(distance_metric, pd, e);
-
-				vp += ip;
-
-				if(d < da[0]) {
-					da[3] = da[2];
-					da[2] = da[1];
-					da[1] = da[0];
-					da[0] = d;
-
-					pa[3] = pa[2];
-					pa[2] = pa[1];
-					pa[1] = pa[0];
-					pa[0] = vp;
-				}
-				else if(d < da[1]) {
-					da[3] = da[2];
-					da[2] = da[1];
-					da[1] = d;
-
-					pa[3] = pa[2];
-					pa[2] = pa[1];
-					pa[1] = vp;
-				}
-				else if(d < da[2]) {
-					da[3] = da[2];
-					da[2] = d;
-
-					pa[3] = pa[2];
-					pa[2] = vp;
-				}
-				else if(d < da[3]) {
-					da[3] = d;
-					pa[3] = vp;
-				}
-			}
-		}
-	}
-
-	float4 result = make_float4(pa[n1].x, pa[n1].y, pa[n1].z, da[n1]);
-
-	if(n2 != -1)
-		result = make_float4(pa[n2].x, pa[n2].y, pa[n2].z, da[n2]) - result;
-
-	return result;
-}
-#endif
-
-ccl_device float voronoi_F1_distance(float3 p)
-{
-	/* returns squared distance in da */
-	float da = 1e10f;
-
-#ifndef __KERNEL_SSE2__
-	int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z);
-
-	for (int xx = -1; xx <= 1; xx++) {
-		for (int yy = -1; yy <= 1; yy++) {
-			for (int zz = -1; zz <= 1; zz++) {
-				float3 ip = make_float3(ix + xx, iy + yy, iz + zz);
-				float3 vp = ip + cellnoise_color(ip);
-				float d = len_squared(p - vp);
-				da = min(d, da);
-			}
-		}
-	}
-#else
-	ssef vec_p = load4f(p);
-	ssei xyzi = quick_floor_sse(vec_p);
-
-	for (int xx = -1; xx <= 1; xx++) {
-		for (int yy = -1; yy <= 1; yy++) {
-			for (int zz = -1; zz <= 1; zz++) {
-				ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0));
-				ssef vp = ip + cellnoise_color(ip);
-				float d = len_squared<1, 1, 1, 0>(vec_p - vp);
-				da = min(d, da);
-			}
-		}
-	}
-#endif
-
-	return da;
-}
-
-ccl_device float3 voronoi_F1_color(float3 p)
-{
-	/* returns color of the nearest point */
-	float da = 1e10f;
-
-#ifndef __KERNEL_SSE2__
-	float3 pa;
-	int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z);
-
-	for (int xx = -1; xx <= 1; xx++) {
-		for (int yy = -1; yy <= 1; yy++) {
-			for (int zz = -1; zz <= 1; zz++) {
-				float3 ip = make_float3(ix + xx, iy + yy, iz + zz);
-				float3 vp = ip + cellnoise_color(ip);
-				float d = len_squared(p - vp);
-
-				if(d < da) {
-					da = d;
-					pa = vp;
-				}
-			}
-		}
-	}
-
-	return cellnoise_color(pa);
-#else
-	ssef pa, vec_p = load4f(p);
-	ssei xyzi = quick_floor_sse(vec_p);
-
-	for (int xx = -1; xx <= 1; xx++) {
-		for (int yy = -1; yy <= 1; yy++) {
-			for (int zz = -1; zz <= 1; zz++) {
-				ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0));
-				ssef vp = ip + cellnoise_color(ip);
-				float d = len_squared<1, 1, 1, 0>(vec_p - vp);
-
-				if(d < da) {
-					da = d;
-					pa = vp;
-				}
-			}
-		}
-	}
-
-	ssef color = cellnoise_color(pa);
-	return (float3 &)color;
-#endif
-}
-
-#if 0
-ccl_device float voronoi_F1(float3 p) { return voronoi_Fn(p, 0.0f, 0, -1).w; }
-ccl_device float voronoi_F2(float3 p) { return voronoi_Fn(p, 0.0f, 1, -1).w; }
-ccl_device float voronoi_F3(float3 p) { return voronoi_Fn(p, 0.0f, 2, -1).w; }
-ccl_device float voronoi_F4(float3 p) { return voronoi_Fn(p, 0.0f, 3, -1).w; }
-ccl_device float voronoi_F1F2(float3 p) { return voronoi_Fn(p, 0.0f, 0, 1).w; }
-
-ccl_device float voronoi_Cr(float3 p)
-{
-	/* crackle type pattern, just a scale/clamp of F2-F1 */
-	float t = 10.0f*voronoi_F1F2(p);
-	return (t > 1.0f)? 1.0f: t;
-}
-
-ccl_device float voronoi_F1S(float3 p) { return 2.0f*voronoi_F1(p) - 1.0f; }
-ccl_device float voronoi_F2S(float3 p) { return 2.0f*voronoi_F2(p) - 1.0f; }
-ccl_device float voronoi_F3S(float3 p) { return 2.0f*voronoi_F3(p) - 1.0f; }
-ccl_device float voronoi_F4S(float3 p) { return 2.0f*voronoi_F4(p) - 1.0f; }
-ccl_device float voronoi_F1F2S(float3 p) { return 2.0f*voronoi_F1F2(p) - 1.0f; }
-ccl_device float voronoi_CrS(float3 p) { return 2.0f*voronoi_Cr(p) - 1.0f; }
-#endif
-
-/* Noise Bases */
-
-ccl_device float noise_basis(float3 p, NodeNoiseBasis basis)
-{
-	/* Only Perlin enabled for now, others break CUDA compile by making kernel
-	 * too big, with compile using > 4GB, due to everything being inlined. */
-
-#if 0
-	if(basis == NODE_NOISE_PERLIN)
-#endif
-		return noise(p);
-#if 0
-	if(basis == NODE_NOISE_VORONOI_F1)
-		return voronoi_F1S(p);
-	if(basis == NODE_NOISE_VORONOI_F2)
-		return voronoi_F2S(p);
-	if(basis == NODE_NOISE_VORONOI_F3)
-		return voronoi_F3S(p);
-	if(basis == NODE_NOISE_VORONOI_F4)
-		return voronoi_F4S(p);
-	if(basis == NODE_NOISE_VORONOI_F2_F1)
-		return voronoi_F1F2S(p);
-	if(basis == NODE_NOISE_VORONOI_CRACKLE)
-		return voronoi_CrS(p);
-	if(basis == NODE_NOISE_CELL_NOISE)
-		return cellnoise(p);
-	
-	return 0.0f;
-#endif
-}
-
-/* Soft/Hard Noise */
-
-ccl_device float noise_basis_hard(float3 p, NodeNoiseBasis basis, int hard)
-{
-	float t = noise_basis(p, basis);
-	return (hard)? fabsf(2.0f*t - 1.0f): t;
-}
-
 /* Turbulence */
 
-ccl_device_noinline float noise_turbulence(float3 p, NodeNoiseBasis basis, float octaves, int hard)
+ccl_device_noinline float noise_turbulence(float3 p, float octaves, int hard)
 {
 	float fscale = 1.0f;
 	float amp = 1.0f;
@@ -281,7 +29,7 @@ ccl_device_noinline float noise_turbulence(float3 p, NodeNoiseBasis basis, float
 	n = float_to_int(octaves);
 
 	for(i = 0; i <= n; i++) {
-		float t = noise_basis(fscale*p, basis);
+		float t = noise(fscale*p);
 
 		if(hard)
 			t = fabsf(2.0f*t - 1.0f);
@@ -294,7 +42,7 @@ ccl_device_noinline float noise_turbulence(float3 p, NodeNoiseBasis basis, float
 	float rmd = octaves - floorf(octaves);
 
 	if(rmd != 0.0f) {
-		float t = noise_basis(fscale*p, basis);
+		float t = noise(fscale*p);
 
 		if(hard)
 			t = fabsf(2.0f*t - 1.0f);
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index cd38ce4ba9b..33aa5e7c51c 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 #ifndef __SVM_TYPES_H__
@@ -28,6 +28,29 @@ CCL_NAMESPACE_BEGIN
 
 /* Nodes */
 
+/* Known frequencies of used nodes, used for selective node compilation
+ * in the kernel. Currently this only affects the split OpenCL kernel.
+ *
+ * Kept as defines so it's easy to check from the preprocessor which
+ * nodes are to be compiled.
+ *
+ * The lower the group number, the more often the node is used.
+ */
+#define NODE_GROUP_LEVEL_0    0
+#define NODE_GROUP_LEVEL_1    1
+#define NODE_GROUP_LEVEL_2    2
+#define NODE_GROUP_LEVEL_3    3
+#define NODE_GROUP_LEVEL_MAX  NODE_GROUP_LEVEL_3
+
+#define NODE_FEATURE_VOLUME     (1 << 0)
+#define NODE_FEATURE_HAIR       (1 << 1)
+#define NODE_FEATURE_BUMP       (1 << 2)
+/* TODO(sergey): Consider using something like ((uint)(-1)).
+ * Need to check carefully operand types around usage of this
+ * define first.
+ */
+#define NODE_FEATURE_ALL        (NODE_FEATURE_VOLUME|NODE_FEATURE_HAIR|NODE_FEATURE_BUMP)
+
 typedef enum NodeType {
 	NODE_END = 0,
 	NODE_CLOSURE_BSDF,
@@ -103,8 +126,8 @@ typedef enum NodeType {
 	NODE_NORMAL_MAP,
 	NODE_HAIR_INFO,
 	NODE_UVMAP,
+	NODE_TEX_VOXEL,
 
-	/* Camera ray nodes. */
 	NODE_CAMERA_PATH_ATTRIBUTE,
 	NODE_CAMERA_SAMPLE_PERSPECTIVE,
 	NODE_CAMERA_RAY_OUTPUT,
@@ -262,27 +285,6 @@ typedef enum NodeConvert {
 	NODE_CONVERT_IV
 } NodeConvert;
 
-typedef enum NodeDistanceMetric {
-	NODE_VORONOI_DISTANCE_SQUARED,
-	NODE_VORONOI_ACTUAL_DISTANCE,
-	NODE_VORONOI_MANHATTAN,
-	NODE_VORONOI_CHEBYCHEV,
-	NODE_VORONOI_MINKOVSKY_H,
-	NODE_VORONOI_MINKOVSKY_4,
-	NODE_VORONOI_MINKOVSKY
-} NodeDistanceMetric;
-
-typedef enum NodeNoiseBasis {
-	NODE_NOISE_PERLIN,
-	NODE_NOISE_VORONOI_F1,
-	NODE_NOISE_VORONOI_F2,
-	NODE_NOISE_VORONOI_F3,
-	NODE_NOISE_VORONOI_F4,
-	NODE_NOISE_VORONOI_F2_F1,
-	NODE_NOISE_VORONOI_CRACKLE,
-	NODE_NOISE_CELL_NOISE
-} NodeNoiseBasis;
-
 typedef enum NodeMusgraveType {
 	NODE_MUSGRAVE_MULTIFRACTAL,
 	NODE_MUSGRAVE_FBM,
@@ -340,6 +342,24 @@ typedef enum NodeNormalMapSpace {
 	NODE_NORMAL_MAP_BLENDER_WORLD,
 } NodeNormalMapSpace;
 
+typedef enum NodeImageProjection {
+	NODE_IMAGE_PROJ_FLAT   = 0,
+	NODE_IMAGE_PROJ_BOX    = 1,
+	NODE_IMAGE_PROJ_SPHERE = 2,
+	NODE_IMAGE_PROJ_TUBE   = 3,
+} NodeImageProjection;
+
+typedef enum NodeBumpOffset {
+	NODE_BUMP_OFFSET_CENTER,  /* e.g. svm_node_wireframe adds no derivative term */
+	NODE_BUMP_OFFSET_DX,      /* e.g. svm_node_wireframe also samples at P - dP.dx */
+	NODE_BUMP_OFFSET_DY,      /* e.g. svm_node_wireframe also samples at P - dP.dy */
+} NodeBumpOffset;
+
+typedef enum NodeTexVoxelSpace {
+	NODE_TEX_VOXEL_SPACE_OBJECT = 0,  /* svm_node_tex_voxel maps the coordinate with volume_normalized_position() */
+	NODE_TEX_VOXEL_SPACE_WORLD  = 1,  /* svm_node_tex_voxel maps the coordinate with a transform read via the node offset */
+} NodeTexVoxelSpace;
+
 typedef enum ShaderType {
 	SHADER_TYPE_SURFACE,
 	SHADER_TYPE_VOLUME,
@@ -355,7 +375,6 @@ typedef enum ClosureType {
 	/* Diffuse */
 	CLOSURE_BSDF_DIFFUSE_ID,
 	CLOSURE_BSDF_OREN_NAYAR_ID,
-	CLOSURE_BSDF_WESTIN_SHEEN_ID,
 	CLOSURE_BSDF_DIFFUSE_RAMP_ID,
 	CLOSURE_BSDF_DIFFUSE_TOON_ID,
 
@@ -369,7 +388,6 @@ typedef enum ClosureType {
 	CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID,
 	CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID,
 	CLOSURE_BSDF_ASHIKHMIN_VELVET_ID,
-	CLOSURE_BSDF_WESTIN_BACKSCATTER_ID,
 	CLOSURE_BSDF_PHONG_RAMP_ID,
 	CLOSURE_BSDF_GLOSSY_TOON_ID,
 	CLOSURE_BSDF_HAIR_REFLECTION_ID,
@@ -428,6 +446,7 @@ typedef enum NodePathAttribute {
 #define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID)
 #define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID)
 #define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
+#define CLOSURE_IS_GLASS(type) ((type) >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && (type) <= CLOSURE_BSDF_SHARP_GLASS_ID)  /* range test: assumes only glass closures lie between these two IDs; argument is parenthesized so expressions are safe */
 
 #define CLOSURE_WEIGHT_CUTOFF 1e-5f
 
diff --git a/intern/cycles/kernel/svm/svm_value.h b/intern/cycles/kernel/svm/svm_value.h
index 7beed065288..c1c2b539df3 100644
--- a/intern/cycles/kernel/svm/svm_value.h
+++ b/intern/cycles/kernel/svm/svm_value.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h
index 61d33aeb8cf..4c32130d06d 100644
--- a/intern/cycles/kernel/svm/svm_vector_transform.h
+++ b/intern/cycles/kernel/svm/svm_vector_transform.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -33,7 +33,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo
 	NodeVectorTransformConvertSpace to = (NodeVectorTransformConvertSpace)ito;
 	
 	Transform tfm;
-	bool is_object = (sd->object != OBJECT_NONE);
+	bool is_object = (ccl_fetch(sd, object) != OBJECT_NONE);
 	bool is_direction = (type == NODE_VECTOR_TRANSFORM_TYPE_VECTOR || type == NODE_VECTOR_TRANSFORM_TYPE_NORMAL);
 	
 	/* From world */
@@ -45,7 +45,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo
 			else
 				in = transform_point(&tfm, in);
 		}
-		else if (to == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_OBJECT && is_object) {
+		else if(to == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_OBJECT && is_object) {
 			if(is_direction)
 				object_inverse_dir_transform(kg, sd, &in);
 			else
@@ -54,7 +54,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo
 	}
 	
 	/* From camera */
-	else if (from == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_CAMERA) {
+	else if(from == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_CAMERA) {
 		if(to == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_WORLD || to == NODE_VECTOR_TRANSFORM_CONVERT_SPACE_OBJECT) {
 			tfm = kernel_data.cam.cameratoworld;
 			if(is_direction)
diff --git a/intern/cycles/kernel/svm/svm_voronoi.h b/intern/cycles/kernel/svm/svm_voronoi.h
index 083a2f30e06..d612d7e973f 100644
--- a/intern/cycles/kernel/svm/svm_voronoi.h
+++ b/intern/cycles/kernel/svm/svm_voronoi.h
@@ -11,13 +11,99 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
 
 /* Voronoi */
 
+ccl_device float voronoi_F1_distance(float3 p)
+{
+	/* Smallest squared distance from p to the jittered points of the 27 cells around it. */
+	float min_dist_sq = 1e10f;
+
+#ifndef __KERNEL_SSE2__
+	int cell_x = floor_to_int(p.x), cell_y = floor_to_int(p.y), cell_z = floor_to_int(p.z);
+
+	for(int ox = -1; ox <= 1; ox++) {
+		for(int oy = -1; oy <= 1; oy++) {
+			for(int oz = -1; oz <= 1; oz++) {
+				float3 cell = make_float3(cell_x + ox, cell_y + oy, cell_z + oz);
+				float3 feature = cell + cellnoise_color(cell);
+				float dist_sq = len_squared(p - feature);
+				min_dist_sq = min(dist_sq, min_dist_sq);
+			}
+		}
+	}
+#else
+	ssef p_sse = load4f(p);
+	ssei cell_xyz = quick_floor_sse(p_sse);
+
+	for(int ox = -1; ox <= 1; ox++) {
+		for(int oy = -1; oy <= 1; oy++) {
+			for(int oz = -1; oz <= 1; oz++) {
+				ssef cell = ssef(cell_xyz + ssei(ox, oy, oz, 0));
+				ssef feature = cell + cellnoise_color(cell);
+				float dist_sq = len_squared<1, 1, 1, 0>(p_sse - feature);
+				min_dist_sq = min(dist_sq, min_dist_sq);
+			}
+		}
+	}
+#endif
+
+	return min_dist_sq;
+}
+
+ccl_device float3 voronoi_F1_color(float3 p)
+{
+	/* Returns cellnoise_color() of the jittered cell point nearest to p (27 surrounding cells are searched). */
+	float da = 1e10f;
+	/* pa starts at the origin so the result stays defined when no candidate beats da (NaN or infinite p). */
+#ifndef __KERNEL_SSE2__
+	float3 pa = make_float3(0.0f, 0.0f, 0.0f);
+	int ix = floor_to_int(p.x), iy = floor_to_int(p.y), iz = floor_to_int(p.z);
+
+	for(int xx = -1; xx <= 1; xx++) {
+		for(int yy = -1; yy <= 1; yy++) {
+			for(int zz = -1; zz <= 1; zz++) {
+				float3 ip = make_float3(ix + xx, iy + yy, iz + zz);
+				float3 vp = ip + cellnoise_color(ip);
+				float d = len_squared(p - vp);
+
+				if(d < da) {
+					da = d;
+					pa = vp;
+				}
+			}
+		}
+	}
+
+	return cellnoise_color(pa);
+#else
+	ssef pa = ssef(_mm_setzero_ps()), vec_p = load4f(p);
+	ssei xyzi = quick_floor_sse(vec_p);
+
+	for(int xx = -1; xx <= 1; xx++) {
+		for(int yy = -1; yy <= 1; yy++) {
+			for(int zz = -1; zz <= 1; zz++) {
+				ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0));
+				ssef vp = ip + cellnoise_color(ip);
+				float d = len_squared<1, 1, 1, 0>(vec_p - vp);
+
+				if(d < da) {
+					da = d;
+					pa = vp;
+				}
+			}
+		}
+	}
+
+	ssef color = cellnoise_color(pa);
+	return (float3 &)color;
+#endif
+}
+
 ccl_device_noinline float4 svm_voronoi(NodeVoronoiColoring coloring, float3 p)
 {
 	if(coloring == NODE_VORONOI_INTENSITY) {
diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h
new file mode 100644
index 00000000000..bbb687dfce5
--- /dev/null
+++ b/intern/cycles/kernel/svm/svm_voxel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#if !defined(__KERNEL_GPU__)
+
+/* Voxel texture node.
+ *
+ * Samples the 3D image with id node.y at the coordinate loaded from the
+ * stack, then writes the sample's w component as density and its xyz
+ * components as color to whichever output offsets are valid. Coordinates
+ * outside the [0, 1] cube give zero density and black color without
+ * sampling the image.
+ *
+ * TODO(sergey): Think of making it more generic volume-type attribute
+ * sampler.
+ */
+ccl_device void svm_node_tex_voxel(KernelGlobals *kg,
+                                   ShaderData *sd,
+                                   float *stack,
+                                   uint4 node,
+                                   int *offset)
+{
+	/* node.z packs the coordinate offset, both output offsets and the space. */
+	uint co_offset, density_out_offset, color_out_offset, space;
+	decode_node_uchar4(node.z, &co_offset, &density_out_offset, &color_out_offset, &space);
+	int id = node.y;
+
+	float3 co = stack_load_float3(stack, co_offset);
+	if(space != NODE_TEX_VOXEL_SPACE_OBJECT) {
+		/* World space: the transform is read through the node offset. */
+		kernel_assert(space == NODE_TEX_VOXEL_SPACE_WORLD);
+		Transform tfm;
+		tfm.x = read_node_float(kg, offset);
+		tfm.y = read_node_float(kg, offset);
+		tfm.z = read_node_float(kg, offset);
+		tfm.w = read_node_float(kg, offset);
+		co = transform_point(&tfm, co);
+	}
+	else {
+		co = volume_normalized_position(kg, sd, co);
+	}
+
+	/* Defaults are what gets stored for coordinates outside of the unit cube. */
+	float density = 0.0f;
+	float3 color = make_float3(0.0f, 0.0f, 0.0f);
+	bool is_outside = (co.x < 0.0f || co.y < 0.0f || co.z < 0.0f ||
+	                   co.x > 1.0f || co.y > 1.0f || co.z > 1.0f);
+	if(!is_outside) {
+		float4 r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z);
+		density = r.w;
+		color = make_float3(r.x, r.y, r.z);
+	}
+
+	if(stack_valid(density_out_offset))
+		stack_store_float(stack, density_out_offset, density);
+	if(stack_valid(color_out_offset))
+		stack_store_float3(stack, color_out_offset, color);
+}
+
+#endif  /* !defined(__KERNEL_GPU__) */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_wave.h b/intern/cycles/kernel/svm/svm_wave.h
index 7f9081539a4..6eaddaf301c 100644
--- a/intern/cycles/kernel/svm/svm_wave.h
+++ b/intern/cycles/kernel/svm/svm_wave.h
@@ -11,7 +11,7 @@
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
- * limitations under the License
+ * limitations under the License.
  */
 
 CCL_NAMESPACE_BEGIN
@@ -28,7 +28,7 @@ ccl_device_noinline float svm_wave(NodeWaveType type, float3 p, float detail, fl
 		n = len(p) * 20.0f;
 	
 	if(distortion != 0.0f)
-		n += distortion * noise_turbulence(p*dscale, NODE_NOISE_PERLIN, detail, 0);
+		n += distortion * noise_turbulence(p*dscale, detail, 0);
 
 	return 0.5f + 0.5f * sinf(n);
 }
diff --git a/intern/cycles/kernel/svm/svm_wavelength.h b/intern/cycles/kernel/svm/svm_wavelength.h
index 9e57c470c0f..57030f3979d 100644
--- a/intern/cycles/kernel/svm/svm_wavelength.h
+++ b/intern/cycles/kernel/svm/svm_wavelength.h
@@ -77,7 +77,7 @@ ccl_device void svm_node_wavelength(ShaderData *sd, float *stack, uint wavelengt
 	int i = float_to_int(ii);
 	float3 color;
 	
-	if (i < 0 || i >= 80) {
+	if(i < 0 || i >= 80) {
 		color = make_float3(0.0f, 0.0f, 0.0f);
 	}
 	else {
diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h
index 660e6e2ca47..30ccd523add 100644
--- a/intern/cycles/kernel/svm/svm_wireframe.h
+++ b/intern/cycles/kernel/svm/svm_wireframe.h
@@ -34,20 +34,16 @@ CCL_NAMESPACE_BEGIN
 
 /* Wireframe Node */
 
-ccl_device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_size, uint out_fac, uint use_pixel_size)
+ccl_device float wireframe(KernelGlobals *kg,
+                           ShaderData *sd,
+                           float size,
+                           int pixel_size,
+                           float3 *P)
 {
-	/* Input Data */
-	float size = stack_load_float(stack, in_size);
-	int pixel_size = (int)use_pixel_size;
-	
-	/* Output */
-	float f = 0.0f;
-
-	/* Calculate wireframe */
 #ifdef __HAIR__
-	if (sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE)
+	if(ccl_fetch(sd, prim) != PRIM_NONE && ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)
 #else
-	if (sd->prim != PRIM_NONE)
+	if(ccl_fetch(sd, prim) != PRIM_NONE)
 #endif
 	{
 		float3 Co[3];
@@ -55,45 +51,85 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *sta
 
 		/* Triangles */
 		int np = 3;
-		
-		if(sd->type & PRIMITIVE_TRIANGLE)
-			triangle_vertices(kg, sd->prim, Co);
+
+		if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE)
+			triangle_vertices(kg, ccl_fetch(sd, prim), Co);
 		else
-			motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co);
+			motion_triangle_vertices(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), Co);
 
-		if(!(sd->flag & SD_TRANSFORM_APPLIED)) {
+		if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED)) {
 			object_position_transform(kg, sd, &Co[0]);
 			object_position_transform(kg, sd, &Co[1]);
 			object_position_transform(kg, sd, &Co[2]);
 		}
-		
+
 		if(pixel_size) {
 			// Project the derivatives of P to the viewing plane defined
 			// by I so we have a measure of how big is a pixel at this point
-			float pixelwidth_x = len(sd->dP.dx - dot(sd->dP.dx, sd->I) * sd->I);
-			float pixelwidth_y = len(sd->dP.dy - dot(sd->dP.dy, sd->I) * sd->I);
+			float pixelwidth_x = len(ccl_fetch(sd, dP).dx - dot(ccl_fetch(sd, dP).dx, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
+			float pixelwidth_y = len(ccl_fetch(sd, dP).dy - dot(ccl_fetch(sd, dP).dy, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
 			// Take the average of both axis' length
 			pixelwidth = (pixelwidth_x + pixelwidth_y) * 0.5f;
 		}
-		
+
 		// Use half the width as the neighbor face will render the
 		// other half. And take the square for fast comparison
 		pixelwidth *= 0.5f * size;
 		pixelwidth *= pixelwidth;
-		for (int i = 0; i < np; i++) {
+		for(int i = 0; i < np; i++) {
 			int i2 = i ? i - 1 : np - 1;
-			float3 dir = sd->P - Co[i];
+			float3 dir = *P - Co[i];
 			float3 edge = Co[i] - Co[i2];
 			float3 crs = cross(edge, dir);
 			// At this point dot(crs, crs) / dot(edge, edge) is
 			// the square of area / length(edge) == square of the
 			// distance to the edge.
-			if (dot(crs, crs) < (dot(edge, edge) * pixelwidth))
-				f = 1.0f;
+			if(dot(crs, crs) < (dot(edge, edge) * pixelwidth))
+				return 1.0f;
 		}
 	}
-	
-	if (stack_valid(out_fac))
+	return 0.0f;
+}
+
+/* Wireframe node: stores the edge factor at P, plus a finite difference along dP.dx/dP.dy for bump offsets. */
+ccl_device void svm_node_wireframe(KernelGlobals *kg,
+                                   ShaderData *sd,
+                                   float *stack,
+                                   uint4 node)
+{
+	uint in_size = node.y;
+	uint out_fac = node.z;
+	uint use_pixel_size, bump_offset;
+	decode_node_uchar4(node.w, &use_pixel_size, &bump_offset, NULL, NULL);
+
+	/* Input Data */
+	float size = stack_load_float(stack, in_size);
+	int pixel_size = (int)use_pixel_size;
+
+	/* Calculate wireframe */
+#ifdef __SPLIT_KERNEL__
+	/* TODO(sergey): In the split kernel sd is in the global address space,
+	 * which makes it difficult to re-use the same wireframe() function, so
+	 * a local copy of P is passed instead. With OpenCL 2.0 it's possible to
+	 * avoid this change, but until then we have to live with this exception.
+	 */
+	float3 P = ccl_fetch(sd, P);
+	float f = wireframe(kg, sd, size, pixel_size, &P);
+#else
+	float f = wireframe(kg, sd, size, pixel_size, &ccl_fetch(sd, P));
+#endif
+
+	/* TODO(sergey): Think of faster way to calculate derivatives. */
+	if(bump_offset == NODE_BUMP_OFFSET_DX || bump_offset == NODE_BUMP_OFFSET_DY) {
+		float3 delta = (bump_offset == NODE_BUMP_OFFSET_DX)? ccl_fetch(sd, dP).dx: ccl_fetch(sd, dP).dy;
+		float3 P_offset = ccl_fetch(sd, P) - delta;
+		float delta_len = len(delta);
+		/* A zero-length differential would turn the division into inf or NaN. */
+		if(delta_len != 0.0f)
+			f += (f - wireframe(kg, sd, size, pixel_size, &P_offset)) / delta_len;
+	}
+
+	if(stack_valid(out_fac))
+ 		stack_store_float(stack, out_fac, f);
+ }