Merge remote-tracking branch 'origin/master' into soc-2014-fluid

Conflicts: .gitignore intern/cycles/CMakeLists.txt source/blender/blenkernel/intern/smoke.c source/blender/python/intern/bpy_interface.c source/creator/CMakeLists.txt
author: Daniel Genrich <daniel.genrich@gmx.net> 2014-10-23 17:12:28 +0400
committer: Daniel Genrich <daniel.genrich@gmx.net> 2014-10-23 17:12:28 +0400
commit: 9ff1ebed52e0f858a395eeea4caf89304e068b2d (patch)
tree: b05d0f4b229de61b088a128ad412dd7bba347928 /intern/cycles/kernel
parent: a2ed11c6eeab5fab8cb81e32e1c68fdafdd5dbbc (diff)
parent: eaaeae469968c5c78a5d7e6d202f1af00b382a79 (diff)
81 files changed, 4614 insertions, 2506 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index d18f4fa2998..c521e1383a4 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -19,12 +19,13 @@ set(SRC
 set(SRC_HEADERS
 	kernel.h
 	kernel_accumulate.h
+	kernel_bake.h
 	kernel_camera.h
 	kernel_compat_cpu.h
 	kernel_compat_cuda.h
 	kernel_compat_opencl.h
+	kernel_debug.h
 	kernel_differential.h
-	kernel_displace.h
 	kernel_emission.h
 	kernel_film.h
 	kernel_globals.h
@@ -35,6 +36,8 @@ set(SRC_HEADERS
 	kernel_passes.h
 	kernel_path.h
 	kernel_path_state.h
+	kernel_path_surface.h
+	kernel_path_volume.h
 	kernel_projection.h
 	kernel_random.h
 	kernel_shader.h
@@ -58,8 +61,7 @@ set(SRC_CLOSURE_HEADERS
 	closure/bsdf_toon.h
 	closure/bsdf_transparent.h
 	closure/bsdf_util.h
-	closure/bsdf_ward.h
-	closure/bsdf_westin.h
+	closure/bsdf_ashikhmin_shirley.h
 	closure/bsdf_hair.h
 	closure/bssrdf.h
 	closure/emissive.h
@@ -95,8 +97,8 @@ set(SRC_SVM_HEADERS
 	svm/svm_noisetex.h
 	svm/svm_normal.h
 	svm/svm_ramp.h
-	svm/svm_sepcomb_rgb.h
 	svm/svm_sepcomb_hsv.h
+	svm/svm_sepcomb_vector.h
 	svm/svm_sky.h
 	svm/svm_tex_coord.h
 	svm/svm_texture.h
@@ -111,8 +113,10 @@ set(SRC_GEOM_HEADERS
 	geom/geom.h
 	geom/geom_attribute.h
 	geom/geom_bvh.h
+	geom/geom_bvh_shadow.h
 	geom/geom_bvh_subsurface.h
 	geom/geom_bvh_traversal.h
+	geom/geom_bvh_volume.h
 	geom/geom_curve.h
 	geom/geom_motion_curve.h
 	geom/geom_motion_triangle.h
@@ -146,50 +150,69 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")
 
 	# warn for other versions
-	if(CUDA_VERSION MATCHES "60")
+	if(CUDA_VERSION MATCHES "65")
 	else()
 		message(WARNING
 			"CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, "
-			"build may succeed but only CUDA 6.0 is officially supported")
+			"build may succeed but only CUDA 6.5 is officially supported")
 	endif()
 
 	# build for each arch
 	set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
 	set(cuda_cubins)
 
-	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
-		set(cuda_cubin kernel_${arch}.cubin)
+	macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
+		if(${experimental})
+			set(cuda_extra_flags "-D__KERNEL_CUDA_EXPERIMENTAL__")
+			set(cuda_cubin kernel_experimental_${arch}.cubin)
+		else()
+			set(cuda_extra_flags "")
+			set(cuda_cubin kernel_${arch}.cubin)
+		endif()
+
+		if(WITH_CYCLES_DEBUG)
+			set(cuda_debug_flags "-D__KERNEL_DEBUG__")
+		else()
+			set(cuda_debug_flags "")
+		endif()
 
 		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}")
 		set(cuda_math_flags "--use_fast_math")
 
-		if(CUDA_VERSION LESS 60 AND ${arch} MATCHES "sm_50")
-			message(WARNING "Can't build kernel for CUDA sm_50 architecture, skipping")
-		elseif(CUDA_VERSION LESS 50 AND ${arch} MATCHES "sm_35")
-			message(WARNING "Can't build kernel for CUDA sm_35 architecture, skipping")
-		else()
-			add_custom_command(
-				OUTPUT ${cuda_cubin}
-				COMMAND ${CUDA_NVCC_EXECUTABLE}
-				        -arch=${arch}
-				        -m${CUDA_BITS}
-				        --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu
-				        -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
-				        --ptxas-options="-v"
-				        ${cuda_arch_flags}
-				        ${cuda_version_flags}
-				        ${cuda_math_flags}
-				        -I${CMAKE_CURRENT_SOURCE_DIR}/../util
-				        -I${CMAKE_CURRENT_SOURCE_DIR}/svm
-				        -DCCL_NAMESPACE_BEGIN=
-				        -DCCL_NAMESPACE_END=
-				        -DNVCC
-
-				DEPENDS ${cuda_sources})
-
-			delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
-			list(APPEND cuda_cubins ${cuda_cubin})
-		endif()
+		add_custom_command(
+			OUTPUT ${cuda_cubin}
+			COMMAND ${CUDA_NVCC_EXECUTABLE}
+					-arch=${arch}
+					-m${CUDA_BITS}
+					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu
+					-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
+					--ptxas-options="-v"
+					${cuda_arch_flags}
+					${cuda_version_flags}
+					${cuda_math_flags}
+					${cuda_extra_flags}
+					${cuda_debug_flags}
+					-I${CMAKE_CURRENT_SOURCE_DIR}/../util
+					-I${CMAKE_CURRENT_SOURCE_DIR}/svm
+					-DCCL_NAMESPACE_BEGIN=
+					-DCCL_NAMESPACE_END=
+					-DNVCC
+
+			DEPENDS ${cuda_sources})
+
+		delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
+		list(APPEND cuda_cubins ${cuda_cubin})
+
+		unset(cuda_extra_flags)
+		unset(cuda_debug_flags)
+	endmacro()
+
+	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
+		# Compile regular kernel
+		CYCLES_CUDA_KERNEL_ADD(${arch} FALSE)
+
+		# Compile experimental kernel
+		CYCLES_CUDA_KERNEL_ADD(${arch} TRUE)
 	endforeach()
 
 	add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins})
@@ -213,12 +236,14 @@ if(CXX_HAS_SSE)
 		kernel_sse3.cpp
 		kernel_sse41.cpp
 		kernel_avx.cpp
+		kernel_avx2.cpp
 	)
 
 	set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
 	set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
 	set_source_files_properties(kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
 	set_source_files_properties(kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+	set_source_files_properties(kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
 endif()
 
 
diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript
index 04e1bad7538..c0d969e24ae 100644
--- a/intern/cycles/kernel/SConscript
+++ b/intern/cycles/kernel/SConscript
@@ -30,6 +30,7 @@ import subprocess
 import sys
 import os
 import Blender as B
+import btools
 
 def normpath(path):
     return os.path.abspath(os.path.normpath(path))
@@ -64,49 +65,56 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     closure_dir = os.path.join(source_dir, "../closure")
 
     # get CUDA version
-    nvcc_pipe = subprocess.Popen([nvcc, "--version"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
-    output, erroroutput = nvcc_pipe.communicate()
+    output = btools.get_command_output([nvcc, "--version"])
     cuda_major_minor = re.findall(r'release (\d+).(\d+)', output)[0]
     cuda_version = int(cuda_major_minor[0])*10 + int(cuda_major_minor[1])
 
-    if cuda_version != 60:
-        print("CUDA version %d.%d detected, build may succeed but only CUDA 6.0 is officially supported." % (cuda_version/10, cuda_version%10))
+    if cuda_version != 65:
+        print("CUDA version %d.%d detected, build may succeed but only CUDA 6.5 is officially supported." % (cuda_version/10, cuda_version%10))
 
     # nvcc flags
     nvcc_flags = "-m%s" % (bits)
-    nvcc_flags += " --cubin --ptxas-options=\"-v\""
+    nvcc_flags += " --cubin --ptxas-options=\"-v\" --use_fast_math"
     nvcc_flags += " -D__KERNEL_CUDA_VERSION__=%d" % (cuda_version)
     nvcc_flags += " -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC"
     nvcc_flags += " -I \"%s\" -I \"%s\" -I \"%s\" -I \"%s\"" % (util_dir, svm_dir, geom_dir, closure_dir)
 
+    if env['WITH_BF_CYCLES_DEBUG']:
+        nvcc_flags += " -D__KERNEL_DEBUG__"
+
     # dependencies
     dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h')
     last_cubin_file = None
 
+    configs = (("kernel_%s.cubin", ''),
+               ("kernel_experimental_%s.cubin", ' -D__KERNEL_CUDA_EXPERIMENTAL__'))
+
     # add command for each cuda architecture
     for arch in cuda_archs:
-        if cuda_version < 60 and arch == "sm_50":
-            print("Can't build kernel for CUDA sm_50 architecture, skipping")
-            continue
-
-        cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch)
-
-        if env['BF_CYCLES_CUDA_ENV']:
-            MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd"
-            command = "\"%s\" & \"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, nvcc_flags, kernel_file, cubin_file)
-        else:
-            command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, nvcc_flags, kernel_file, cubin_file)
-
-        kernel.Command(cubin_file, 'kernel.cu', command)
-        kernel.Depends(cubin_file, dependencies)
-
-        kernel_binaries.append(cubin_file)
-        
-        if not env['WITH_BF_CYCLES_CUDA_THREADED_COMPILE']:
-            # trick to compile one kernel at a time to reduce memory usage
-            if last_cubin_file:
-                kernel.Depends(cubin_file, last_cubin_file)
-            last_cubin_file = cubin_file
+        for config in configs:
+            # TODO(sergey): Use dict instead of tuple in order to increase readability?
+            name = config[0]
+            extra_flags = config[1]
+
+            cubin_file = os.path.join(build_dir, name % arch)
+            current_flags = nvcc_flags + extra_flags
+
+            if env['BF_CYCLES_CUDA_ENV']:
+                MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd"
+                command = "\"%s\" & \"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, current_flags, kernel_file, cubin_file)
+            else:
+                command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, current_flags, kernel_file, cubin_file)
+
+            kernel.Command(cubin_file, 'kernel.cu', command)
+            kernel.Depends(cubin_file, dependencies)
+
+            kernel_binaries.append(cubin_file)
+
+            if not env['WITH_BF_CYCLES_CUDA_THREADED_COMPILE']:
+                # trick to compile one kernel at a time to reduce memory usage
+                if last_cubin_file:
+                    kernel.Depends(cubin_file, last_cubin_file)
+                last_cubin_file = cubin_file
 
 Return('kernel_binaries')
 
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 24b54cd9d9e..7d4783b0f3c 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -23,10 +23,7 @@
 #include "../closure/bsdf_reflection.h"
 #include "../closure/bsdf_refraction.h"
 #include "../closure/bsdf_transparent.h"
-#ifdef __ANISOTROPIC__
-#include "../closure/bsdf_ward.h"
-#endif
-#include "../closure/bsdf_westin.h"
+#include "../closure/bsdf_ashikhmin_shirley.h"
 #include "../closure/bsdf_toon.h"
 #include "../closure/bsdf_hair.h"
 #ifdef __SUBSURFACE__
@@ -83,21 +80,22 @@ ccl_device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const Shader
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_GGX_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-			label = bsdf_microfacet_ggx_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
+		case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-			label = bsdf_microfacet_beckmann_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
-#ifdef __ANISOTROPIC__
-		case CLOSURE_BSDF_WARD_ID:
-			label = bsdf_ward_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
+		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
+			label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
-#endif
 		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
 			label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
@@ -110,14 +108,6 @@ ccl_device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const Shader
 			label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
-		case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID:
-			label = bsdf_westin_backscatter_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
-				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
-			break;
-		case CLOSURE_BSDF_WESTIN_SHEEN_ID:
-			label = bsdf_westin_sheen_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
-				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
-			break;
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
 			label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
@@ -178,18 +168,19 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
 				eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
+			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
 				eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
+			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
 				eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
-#ifdef __ANISOTROPIC__
-			case CLOSURE_BSDF_WARD_ID:
-				eval = bsdf_ward_eval_reflect(sc, sd->I, omega_in, pdf);
+			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
+			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
+				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
-#endif
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
 				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
@@ -199,12 +190,6 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
 				eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
-			case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID:
-				eval = bsdf_westin_backscatter_eval_reflect(sc, sd->I, omega_in, pdf);
-				break;
-			case CLOSURE_BSDF_WESTIN_SHEEN_ID:
-				eval = bsdf_westin_sheen_eval_reflect(sc, sd->I, omega_in, pdf);
-				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
 				eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
@@ -245,18 +230,19 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
 				eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
+			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
 				eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
+			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
 				eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
-#ifdef __ANISOTROPIC__
-			case CLOSURE_BSDF_WARD_ID:
-				eval = bsdf_ward_eval_transmit(sc, sd->I, omega_in, pdf);
+			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
+			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
+				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
-#endif
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
 				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
@@ -266,12 +252,6 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
 				eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
-			case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID:
-				eval = bsdf_westin_backscatter_eval_transmit(sc, sd->I, omega_in, pdf);
-				break;
-			case CLOSURE_BSDF_WESTIN_SHEEN_ID:
-				eval = bsdf_westin_sheen_eval_transmit(sc, sd->I, omega_in, pdf);
-				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
 				eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
@@ -330,18 +310,19 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
 			bsdf_transparent_blur(sc, roughness);
 			break;
 		case CLOSURE_BSDF_MICROFACET_GGX_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
 			bsdf_microfacet_ggx_blur(sc, roughness);
 			break;
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
+		case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
 			bsdf_microfacet_beckmann_blur(sc, roughness);
 			break;
-#ifdef __ANISOTROPIC__
-		case CLOSURE_BSDF_WARD_ID:
-			bsdf_ward_blur(sc, roughness);
+		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
+		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
+			bsdf_ashikhmin_shirley_blur(sc, roughness);
 			break;
-#endif
 		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
 			bsdf_ashikhmin_velvet_blur(sc, roughness);
 			break;
@@ -351,12 +332,6 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
 		case CLOSURE_BSDF_GLOSSY_TOON_ID:
 			bsdf_glossy_toon_blur(sc, roughness);
 			break;
-		case CLOSURE_BSDF_WESTIN_BACKSCATTER_ID:
-			bsdf_westin_backscatter_blur(sc, roughness);
-			break;
-		case CLOSURE_BSDF_WESTIN_SHEEN_ID:
-			bsdf_westin_sheen_blur(sc, roughness);
-			break;
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
 			bsdf_hair_reflection_blur(sc, roughness);
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
new file mode 100644
index 00000000000..ad7864cb8ea
--- /dev/null
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -0,0 +1,210 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+#ifndef __BSDF_ASHIKHMIN_SHIRLEY_H__
+#define __BSDF_ASHIKHMIN_SHIRLEY_H__
+
+/*
+ASHIKHMIN SHIRLEY BSDF
+
+Implementation of
+Michael Ashikhmin and Peter Shirley: "An Anisotropic Phong BRDF Model" (2000)
+
+The Fresnel factor is missing to get a separable bsdf (intensity*color), as is
+the case with all other microfacet-based BSDF implementations in Cycles.
+
+Other than that, the implementation directly follows the paper.
+*/
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device int bsdf_ashikhmin_shirley_setup(ShaderClosure *sc)
+{
+	/* store roughness. could already convert to exponent to save some cycles
+	 * in eval, but this is more consistent with other bsdfs and shader_blur. */
+	sc->data0 = clamp(sc->data0, 1e-4f, 1.0f);
+	sc->data1 = sc->data0;
+
+	sc->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID;
+	return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_GLOSSY;
+}
+
+ccl_device int bsdf_ashikhmin_shirley_aniso_setup(ShaderClosure *sc)
+{
+	/* store roughness. could already convert to exponent to save some cycles
+	 * in eval, but this is more consistent with other bsdfs and shader_blur. */
+	sc->data0 = clamp(sc->data0, 1e-4f, 1.0f);
+	sc->data1 = clamp(sc->data1, 1e-4f, 1.0f);
+
+	sc->type = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID;
+	return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_GLOSSY;
+}
+
+ccl_device void bsdf_ashikhmin_shirley_blur(ShaderClosure *sc, float roughness)
+{
+	sc->data0 = fmaxf(roughness, sc->data0); /* clamp roughness */
+	sc->data1 = fmaxf(roughness, sc->data1);
+}
+
+ccl_device_inline float bsdf_ashikhmin_shirley_roughness_to_exponent(float roughness)
+{
+	return 2.0f / (roughness*roughness) - 2.0f;
+}
+
+ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+{
+	float3 N = sc->N;
+
+	float NdotI = dot(N, I);           /* in Cycles/OSL convention I is omega_out    */
+	float NdotO = dot(N, omega_in);    /* and consequently we use for O omega_in ;)  */
+
+	float out = 0.0f;
+
+	if (NdotI > 0.0f && NdotO > 0.0f) {
+		NdotI = fmaxf(NdotI, 1e-6f);
+		NdotO = fmaxf(NdotO, 1e-6f);
+		float3 H = normalize(omega_in + I);
+		float HdotI = fmaxf(fabsf(dot(H, I)), 1e-6f);
+		float HdotN = fmaxf(dot(H, N), 1e-6f);
+
+		float pump = 1.0f / fmaxf(1e-6f, (HdotI*fmaxf(NdotO, NdotI))); /* pump from original paper (first derivative disc., but cancels the HdotI in the pdf nicely) */
+		/*float pump = 1.0f / fmaxf(1e-4f, ((NdotO + NdotI) * (NdotO*NdotI))); */ /* pump from d-brdf paper */
+
+		float n_x = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data0);
+		float n_y = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data1);
+
+		if (n_x == n_y) {  /* => isotropic case */
+			float e = n_x;
+			float lobe = powf(HdotN, e);
+			float norm = (n_x + 1.0f) / (8.0f * M_PI_F);
+
+			out = NdotO * norm * lobe * pump;
+			*pdf = norm * lobe / HdotI; /* this is p_h / 4(H.I)  (conversion from 'wh measure' to 'wi measure', eq. 8 in paper) */
+		}
+		else {             /* => ANisotropic case */
+			float3 X, Y;
+			make_orthonormals_tangent(N, sc->T, &X, &Y);
+
+			float HdotX = dot(H, X);
+			float HdotY = dot(H, Y);
+			float e = (n_x * HdotX*HdotX + n_y * HdotY*HdotY) / (1.0f - HdotN*HdotN);
+			float lobe = powf(HdotN, e);
+			float norm = sqrtf((n_x + 1.0f)*(n_y + 1.0f)) / (8.0f * M_PI_F);
+			
+			out = NdotO * norm * lobe * pump;
+			*pdf = norm * lobe / HdotI;
+		}
+	}
+
+	return make_float3(out, out, out);
+}
+
+ccl_device float3 bsdf_ashikhmin_shirley_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+{
+	return make_float3(0.0f, 0.0f, 0.0f);
+}
+
+ccl_device_inline void bsdf_ashikhmin_shirley_sample_first_quadrant(float n_x, float n_y, float randu, float randv, float *phi, float *cos_theta)
+{
+	*phi = atanf(sqrtf((n_x + 1.0f) / (n_y + 1.0f)) * tanf(M_PI_2_F * randu));
+	float cos_phi = cosf(*phi);
+	float sin_phi = sinf(*phi);
+	*cos_theta = powf(randv, 1.0f / (n_x * cos_phi*cos_phi + n_y * sin_phi*sin_phi + 1.0f));
+}
+
+ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+{
+	float3 N = sc->N;
+
+	float NdotI = dot(N, I);
+	if (NdotI > 0.0f) {
+
+		float n_x = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data0);
+		float n_y = bsdf_ashikhmin_shirley_roughness_to_exponent(sc->data1);
+
+		/* get x,y basis on the surface for anisotropy */
+		float3 X, Y;
+
+		if(n_x == n_y)
+			make_orthonormals(N, &X, &Y);
+		else
+			make_orthonormals_tangent(N, sc->T, &X, &Y);
+
+		/* sample spherical coords for h in tangent space */
+		float phi;
+		float cos_theta;
+		if (n_x == n_y) {  /* => simple isotropic sampling */
+			phi = M_2PI_F * randu;
+			cos_theta = powf(randv, 1.0f / (n_x + 1.0f));
+		}
+		else {             /* => more complex anisotropic sampling */
+			if (randu < 0.25f) {      /* first quadrant */
+				float remapped_randu = 4.0f * randu;
+				bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta);
+			}
+			else if (randu < 0.5f) {  /* second quadrant */
+				float remapped_randu = 4.0f * (.5f - randu);
+				bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta);
+				phi = M_PI_F - phi;
+			}
+			else if (randu < 0.75f) { /* third quadrant */
+				float remapped_randu = 4.0f * (randu - 0.5f);
+				bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta);
+				phi = M_PI_F + phi;
+			}
+			else {                   /* fourth quadrant */
+				float remapped_randu = 4.0f * (1.0f - randu);
+				bsdf_ashikhmin_shirley_sample_first_quadrant(n_x, n_y, remapped_randu, randv, &phi, &cos_theta);
+				phi = 2.0f * M_PI_F - phi;
+			}
+		}
+
+		/* get half vector in tangent space */
+		float sin_theta = sqrtf(fmaxf(0.0f, 1.0f - cos_theta*cos_theta));
+		float cos_phi = cosf(phi);
+		float sin_phi = sinf(phi); /* no sqrt(1-cos^2) here b/c it causes artifacts */
+		float3 h = make_float3(
+			sin_theta * cos_phi,
+			sin_theta * sin_phi,
+			cos_theta
+			);
+
+		/* half vector to world space */
+		float3 H = h.x*X + h.y*Y + h.z*N;
+		float HdotI = dot(H, I);
+		if (HdotI < 0.0f) H = -H;
+
+		/* reflect I on H to get omega_in */
+		*omega_in = -I + (2.0f * HdotI) * H;
+
+		/* leave the rest to eval_reflect */
+		/* (could maybe optimize a few things by manual inlining, but I doubt it would make much difference) */
+		*eval = bsdf_ashikhmin_shirley_eval_reflect(sc, I, *omega_in, pdf);
+
+#ifdef __RAY_DIFFERENTIALS__
+		/* just do the reflection thing for now */
+		*domega_in_dx = (2.0f * dot(N, dIdx)) * N - dIdx;
+		*domega_in_dy = (2.0f * dot(N, dIdy)) * N - dIdy;
+#endif
+	}
+
+	return LABEL_REFLECT | LABEL_GLOSSY;
+}
+
+
+CCL_NAMESPACE_END
+
+#endif /* __BSDF_ASHIKHMIN_SHIRLEY_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h
index 19cdb773255..e0b5454592b 100644
--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -63,7 +63,7 @@ ccl_device int bsdf_hair_transmission_setup(ShaderClosure *sc)
 ccl_device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 #ifdef __HAIR__
-	float offset = sc->offset;
+	float offset = sc->data2;
 	float3 Tg = sc->T;
 #else
 	float offset = 0.0f;
@@ -120,7 +120,7 @@ ccl_device float3 bsdf_hair_reflection_eval_transmit(const ShaderClosure *sc, co
 ccl_device float3 bsdf_hair_transmission_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
 #ifdef __HAIR__
-	float offset = sc->offset;
+	float offset = sc->data2;
 	float3 Tg = sc->T;
 #else
 	float offset = 0.0f;
@@ -166,7 +166,7 @@ ccl_device float3 bsdf_hair_transmission_eval_transmit(const ShaderClosure *sc,
 ccl_device int bsdf_hair_reflection_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 #ifdef __HAIR__
-	float offset = sc->offset;
+	float offset = sc->data2;
 	float3 Tg = sc->T;
 #else
 	float offset = 0.0f;
@@ -221,7 +221,7 @@ ccl_device int bsdf_hair_reflection_sample(const ShaderClosure *sc, float3 Ng, f
 ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
 #ifdef __HAIR__
-	float offset = sc->offset;
+	float offset = sc->data2;
 	float3 Tg = sc->T;
 #else
 	float offset = 0.0f;
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index 1ec35e444fe..8737b0e2d94 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -35,20 +35,293 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* GGX */
+/* Approximate erf and erfinv implementations.
+ * Implementation comes straight from Wikipedia:
+ *
+ * http://en.wikipedia.org/wiki/Error_function
+ *
+ * Some constants are baked into the code.
+ */
+
+ccl_device_inline float approx_erff_do(float x) /* approximate erf(x); approx_erff only passes x >= 0 */
+{
+	/* Returning 1 for x > 3 barely distorts the output value
+	 * and gives quite a bit of speedup.
+	 */
+	if(x > 3.0f) {
+		return 1.0f;
+	}
+	float t = 1.0f / (1.0f + 0.47047f*x);
+	return  (1.0f -
+	         t*(0.3480242f + t*(-0.0958798f + t*0.7478556f)) * expf(-x*x)); /* 1 - (cubic in t) * exp(-x*x) */
+}
+
+ccl_device_inline float approx_erff(float x) /* approximate erf(x) for either sign of x */
+{
+	if(x >= 0.0f) {
+		return approx_erff_do(x);
+	}
+	else {
+		return -approx_erff_do(-x); /* odd symmetry: erf(-x) = -erf(x) */
+	}
+}
+
+ccl_device_inline float approx_erfinvf_do(float x) /* approximate erfinv(x); approx_erfinvf only passes x >= 0 */
+{
+	if(x <= 0.7f) { /* central region: x times a ratio of polynomials in x*x */
+		const float x2 = x * x;
+		const float a1 =  0.886226899f;
+		const float a2 = -1.645349621f;
+		const float a3 =  0.914624893f;
+		const float a4 = -0.140543331f;
+		const float b1 = -2.118377725f;
+		const float b2 =  1.442710462f;
+		const float b3 = -0.329097515f;
+		const float b4 =  0.012229801f;
+		return x * (((a4 * x2 + a3) * x2 + a2) * x2 + a1) /
+		          ((((b4 * x2 + b3) * x2 + b2) * x2 + b1) * x2 + 1.0f);
+	}
+	else { /* tail region: ratio of polynomials in z */
+		const float c1 = -1.970840454f;
+		const float c2 = -1.624906493f;
+		const float c3 =  3.429567803f;
+		const float c4 =  1.641345311f;
+		const float d1 =  3.543889200f;
+		const float d2 =  1.637067800f;
+		const float z = sqrtf(-logf((1.0f - fminf(x, 0.99999994f)) * 0.5f)); /* clamp below 1: x == 1 gives logf(0), z = inf and a NaN (inf/inf) result */
+		return (((c4 * z + c3) * z + c2) * z + c1) /
+		        ((d2 * z + d1) * z + 1.0f);
+	}
+}
+
+ccl_device_inline float approx_erfinvf(float x) /* approximate erfinv(x) for either sign of x */
+{
+	if(x >= 0.0f) {
+		return approx_erfinvf_do(x);
+	}
+	else {
+		return -approx_erfinvf_do(-x); /* odd symmetry: erfinv(-x) = -erfinv(x) */
+	}
+}
+
+/* Beckmann and GGX microfacet importance sampling from:
+ * 
+ * Importance Sampling Microfacet-Based BSDFs using the Distribution of Visible Normals.
+ * E. Heitz and E. d'Eon, EGSR 2014 */
+
+ccl_device_inline void microfacet_beckmann_sample_slopes(
+	KernelGlobals *kg,
+	const float cos_theta_i, const float sin_theta_i,
+	float randu, float randv, float *slope_x, float *slope_y,
+	float *G1i) /* writes *slope_x, *slope_y and the masking term *G1i */
+{
+	/* special case (normal incidence) */
+	if(cos_theta_i >= 0.99999f) {
+		const float r = sqrtf(-logf(randu)); /* NOTE(review): randu == 0 gives logf(0) = -inf, so r is infinite; confirm callers never pass 0 */
+		const float phi = M_2PI_F * randv;
+		*slope_x = r * cosf(phi);
+		*slope_y = r * sinf(phi);
+		*G1i = 1.0f;
+		return;
+	}
+
+	/* precomputations */
+	const float tan_theta_i = sin_theta_i/cos_theta_i;
+	const float inv_a = tan_theta_i;
+	const float a = 1.0f/inv_a;
+	const float erf_a = approx_erff(a);
+	const float exp_a2 = expf(-a*a);
+	const float SQRT_PI_INV = 0.56418958354f; /* 1/sqrt(pi) */
+	const float Lambda = 0.5f*(erf_a - 1.0f) + (0.5f*SQRT_PI_INV)*(exp_a2*inv_a);
+	const float G1 = 1.0f/(1.0f + Lambda); /* masking */
+
+	*G1i = G1;
+
+#if 0 /* analytic slope sampling, compiled out in favor of the table lookup in the #else branch */
+	const float C = 1.0f - G1 * erf_a;
+
+	/* sample slope X */
+	if(randu < C) {
+		/* rescale randu */
+		randu = randu / C;
+		const float w_1 = 0.5f * SQRT_PI_INV * sin_theta_i * exp_a2;
+		const float w_2 = cos_theta_i * (0.5f - 0.5f*erf_a);
+		const float p = w_1 / (w_1 + w_2);
+
+		if(randu < p) {
+			randu = randu / p;
+			*slope_x = -sqrtf(-logf(randu*exp_a2));
+		}
+		else {
+			randu = (randu - p) / (1.0f - p);
+			*slope_x = approx_erfinvf(randu - 1.0f - randu*erf_a);
+		}
+	}
+	else {
+		/* rescale randu */
+		randu = (randu - C) / (1.0f - C);
+		*slope_x = approx_erfinvf((-1.0f + 2.0f*randu)*erf_a);
+
+		const float p = (-(*slope_x)*sin_theta_i + cos_theta_i) / (2.0f*cos_theta_i);
+
+		if(randv > p) {
+			*slope_x = -(*slope_x);
+			randv = (randv - p) / (1.0f - p);
+		}
+		else
+			randv = randv / p;
+	}
+
+	/* sample slope Y */
+	*slope_y = approx_erfinvf(2.0f*randv - 1.0f);
+#else
+	/* use precomputed table, because it better preserves stratification
+	 * of the random number pattern */
+	int beckmann_table_offset = kernel_data.tables.beckmann_offset;
+
+	*slope_x = lookup_table_read_2D(kg, randu, cos_theta_i,
+		beckmann_table_offset, BECKMANN_TABLE_SIZE, BECKMANN_TABLE_SIZE);
+	*slope_y = approx_erfinvf(2.0f*randv - 1.0f); /* slope_y depends on randv only, not on the incidence angle */
+#endif
+
+}
+
+ccl_device_inline void microfacet_ggx_sample_slopes(
+	const float cos_theta_i, const float sin_theta_i,
+	float randu, float randv, float *slope_x, float *slope_y,
+	float *G1i) /* writes *slope_x, *slope_y and the masking term *G1i */
+{
+	/* special case (normal incidence) */
+	if(cos_theta_i >= 0.99999f) {
+		const float r = sqrtf(randu/(1.0f - randu));
+		const float phi = M_2PI_F * randv;
+		*slope_x = r * cosf(phi);
+		*slope_y = r * sinf(phi);
+		*G1i = 1.0f;
+
+		return;
+	}
+
+	/* precomputations */
+	const float tan_theta_i = sin_theta_i/cos_theta_i;
+	const float G1_inv = 0.5f * (1.0f + safe_sqrtf(1.0f + tan_theta_i*tan_theta_i));
+
+	*G1i = 1.0f/G1_inv;
+
+	/* sample slope_x */
+	const float A = 2.0f*randu*G1_inv - 1.0f;
+	const float AA = A*A;
+	const float tmp = 1.0f/(AA - 1.0f); /* NOTE(review): A == +-1 (e.g. randu == 0) divides by zero here; confirm callers avoid it */
+	const float B = tan_theta_i;
+	const float BB = B*B;
+	const float D = safe_sqrtf(BB*(tmp*tmp) - (AA - BB)*tmp);
+	const float slope_x_1 = B*tmp - D;
+	const float slope_x_2 = B*tmp + D;
+	*slope_x = (A < 0.0f || slope_x_2*tan_theta_i > 1.0f)? slope_x_1: slope_x_2;
+
+	/* sample slope_y */
+	float S; /* sign of slope_y, chosen by which half of [0, 1] randv falls in */
+
+	if(randv > 0.5f) {
+		S = 1.0f;
+		randv = 2.0f*(randv - 0.5f);
+	}
+	else {
+		S = -1.0f;
+		randv = 2.0f*(0.5f - randv);
+	}
+
+	const float z = (randv*(randv*(randv*0.27385f - 0.73369f) + 0.46341f)) / (randv*(randv*(randv*0.093073f + 0.309420f) - 1.000000f) + 0.597999f); /* ratio of two cubic polynomials in the remapped randv */
+	*slope_y = S * z * safe_sqrtf(1.0f + (*slope_x)*(*slope_x));
+}
+
+ccl_device_inline float3 microfacet_sample_stretched(
+	KernelGlobals *kg, const float3 omega_i,
+	const float alpha_x, const float alpha_y,
+	const float randu, const float randv,
+	bool beckmann, float *G1i) /* beckmann selects Beckmann slopes, otherwise GGX; returns the sampled normal in the same frame as omega_i */
+{
+	/* 1. stretch omega_i */
+	float3 omega_i_ = make_float3(alpha_x * omega_i.x, alpha_y * omega_i.y, omega_i.z);
+	omega_i_ = normalize(omega_i_);
+
+	/* get polar coordinates of omega_i_ */
+	float costheta_ = 1.0f;
+	float sintheta_ = 0.0f;
+	float cosphi_ = 1.0f;
+	float sinphi_ = 0.0f;
+
+	if(omega_i_.z < 0.99999f) { /* otherwise keep the normal-incidence defaults set above */
+		costheta_ = omega_i_.z;
+		sintheta_ = safe_sqrtf(1.0f - costheta_*costheta_);
+
+		float invlen = 1.0f/sintheta_;
+		cosphi_ = omega_i_.x * invlen;
+		sinphi_ = omega_i_.y * invlen;
+	}
+
+	/* 2. sample P22_{omega_i}(x_slope, y_slope, 1, 1) */
+	float slope_x, slope_y;
+
+	if(beckmann) {
+		microfacet_beckmann_sample_slopes(kg, costheta_, sintheta_,
+			randu, randv, &slope_x, &slope_y, G1i);
+	}
+	else {
+		microfacet_ggx_sample_slopes(costheta_, sintheta_,
+			randu, randv, &slope_x, &slope_y, G1i);
+	}
+
+	/* 3. rotate */
+	float tmp = cosphi_*slope_x - sinphi_*slope_y; /* temporary so slope_y is computed from the unrotated slope_x */
+	slope_y = sinphi_*slope_x + cosphi_*slope_y;
+	slope_x = tmp;
+
+	/* 4. unstretch */
+	slope_x = alpha_x * slope_x;
+	slope_y = alpha_y * slope_y;
+
+	/* 5. compute normal */
+	return normalize(make_float3(-slope_x, -slope_y, 1.0f));
+} 
+
+/* GGX microfacet with Smith shadow-masking from:
+ *
+ * Microfacet Models for Refraction through Rough Surfaces
+ * B. Walter, S. R. Marschner, H. Li, K. E. Torrance, EGSR 2007
+ *
+ * Anisotropic from:
+ *
+ * Understanding the Masking-Shadowing Function in Microfacet-Based BRDFs
+ * E. Heitz, Research Report 2014
+ *
+ * Anisotropy is only supported for reflection currently, but adding it for
+ * transmission is just a matter of copying code from reflection if needed. */
 
 ccl_device int bsdf_microfacet_ggx_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ag */
+	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x, clamped to [0, 1] */
+	sc->data1 = sc->data0; /* alpha_y = alpha_x, so the isotropic code paths are taken */
 	
 	sc->type = CLOSURE_BSDF_MICROFACET_GGX_ID;
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
 }
 
+ccl_device int bsdf_microfacet_ggx_aniso_setup(ShaderClosure *sc) /* anisotropic GGX: data0 and data1 are kept as independent roughness values */
+{
+	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x, clamped to [0, 1] */
+	sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* alpha_y, clamped to [0, 1]; may differ from alpha_x */
+	
+	sc->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID;
+
+	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+}
+
 ccl_device int bsdf_microfacet_ggx_refraction_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ag */
+	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
+	sc->data1 = sc->data0; /* alpha_y */
 
 	sc->type = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
 
@@ -57,136 +330,250 @@ ccl_device int bsdf_microfacet_ggx_refraction_setup(ShaderClosure *sc)
 
 ccl_device void bsdf_microfacet_ggx_blur(ShaderClosure *sc, float roughness)
 {
-	sc->data0 = fmaxf(roughness, sc->data0); /* m_ag */
+	sc->data0 = fmaxf(roughness, sc->data0); /* alpha_x: raised to at least roughness */
+	sc->data1 = fmaxf(roughness, sc->data1); /* alpha_y: raised to at least roughness */
 }
 
 ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	float m_ag = max(sc->data0, 1e-4f);
+	float alpha_x = sc->data0; /* alpha_x/alpha_y: roughness along the X and Y axes of the tangent frame built below */
+	float alpha_y = sc->data1;
 	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
 	float3 N = sc->N;
 
-	if(m_refractive || m_ag <= 1e-4f)
-		return make_float3 (0, 0, 0);
+	if(m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f || alpha_x*alpha_y <= 0.0f) /* also reject a single zero alpha: the anisotropic D below divides by alpha_x*alpha_y and would be NaN/inf */
+		return make_float3(0, 0, 0);
+
 	float cosNO = dot(N, I);
 	float cosNI = dot(N, omega_in);
+
 	if(cosNI > 0 && cosNO > 0) {
-		// get half vector
-		float3 Hr = normalize(omega_in + I);
-		// eq. 20: (F*G*D)/(4*in*on)
-		// eq. 33: first we calculate D(m) with m=Hr:
-		float alpha2 = m_ag * m_ag;
-		float cosThetaM = dot(N, Hr);
-		float cosThetaM2 = cosThetaM * cosThetaM;
-		float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2;
-		float cosThetaM4 = cosThetaM2 * cosThetaM2;
-		float D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
-		// eq. 34: now calculate G1(i,m) and G1(o,m)
-		float G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
-		float G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); 
+		/* get half vector */
+		float3 m = normalize(omega_in + I);
+		float alpha2 = alpha_x * alpha_y;
+		float D, G1o, G1i;
+
+		if(alpha_x == alpha_y) {
+			/* isotropic
+			 * eq. 20: (F*G*D)/(4*in*on)
+			 * eq. 33: first we calculate D(m) */
+			float cosThetaM = dot(N, m);
+			float cosThetaM2 = cosThetaM * cosThetaM;
+			float cosThetaM4 = cosThetaM2 * cosThetaM2;
+			float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2;
+			D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
+
+			/* eq. 34: now calculate G1(i,m) and G1(o,m) */
+			G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
+			G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); 
+		}
+		else {
+			/* anisotropic */
+			float3 X, Y, Z = N;
+			make_orthonormals_tangent(Z, sc->T, &X, &Y);
+
+			/* distribution */
+			float3 local_m = make_float3(dot(X, m), dot(Y, m), dot(Z, m));
+			float slope_x = -local_m.x/(local_m.z*alpha_x);
+			float slope_y = -local_m.y/(local_m.z*alpha_y);
+			float slope_len = 1 + slope_x*slope_x + slope_y*slope_y;
+
+			float cosThetaM = local_m.z;
+			float cosThetaM2 = cosThetaM * cosThetaM;
+			float cosThetaM4 = cosThetaM2 * cosThetaM2;
+
+			D = 1 / ((slope_len * slope_len) * M_PI_F * alpha2 * cosThetaM4);
+
+			/* G1(i,m) and G1(o,m) */
+			float tanThetaO2 = (1 - cosNO * cosNO) / (cosNO * cosNO);
+			float cosPhiO = dot(I, X);
+			float sinPhiO = dot(I, Y);
+
+			float alphaO2 = (cosPhiO*cosPhiO)*(alpha_x*alpha_x) + (sinPhiO*sinPhiO)*(alpha_y*alpha_y);
+			alphaO2 /= cosPhiO*cosPhiO + sinPhiO*sinPhiO;
+
+			G1o = 2 / (1 + safe_sqrtf(1 + alphaO2 * tanThetaO2));
+
+			float tanThetaI2 = (1 - cosNI * cosNI) / (cosNI * cosNI);
+			float cosPhiI = dot(omega_in, X);
+			float sinPhiI = dot(omega_in, Y);
+
+			float alphaI2 = (cosPhiI*cosPhiI)*(alpha_x*alpha_x) + (sinPhiI*sinPhiI)*(alpha_y*alpha_y);
+			alphaI2 /= cosPhiI*cosPhiI + sinPhiI*sinPhiI;
+
+			G1i = 2 / (1 + safe_sqrtf(1 + alphaI2 * tanThetaI2));
+		}
+
 		float G = G1o * G1i;
-		float out = (G * D) * 0.25f / cosNO;
-		// eq. 24
-		float pm = D * cosThetaM;
-		// convert into pdf of the sampled direction
-		// eq. 38 - but see also:
-		// eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf
-		*pdf = pm * 0.25f / dot(Hr, I);
-		return make_float3 (out, out, out);
+
+		/* eq. 20 */
+		float common = D * 0.25f / cosNO; /* factor shared by the BSDF value and the pdf */
+		float out = G * common;
+
+		/* eq. 2 in distribution of visible normals sampling
+		 * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */
+
+		/* eq. 38 - but see also:
+		 * eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf
+		 * pdf = pm * 0.25 / dot(m, I); */
+		*pdf = G1o * common;
+
+		return make_float3(out, out, out);
 	}
-	return make_float3 (0, 0, 0);
+
+	return make_float3(0, 0, 0);
 }
 
 ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	float m_ag = max(sc->data0, 1e-4f);
-	float m_eta = sc->data1;
+	float alpha_x = sc->data0;
+	float alpha_y = sc->data1;
+	float m_eta = sc->data2; /* eta is read from data2 now that data1 holds alpha_y */
 	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
 	float3 N = sc->N;
 
-	if(!m_refractive || m_ag <= 1e-4f)
-		return make_float3 (0, 0, 0);
+	if(!m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f)
+		return make_float3(0, 0, 0);
+
 	float cosNO = dot(N, I);
 	float cosNI = dot(N, omega_in);
+
 	if(cosNO <= 0 || cosNI >= 0)
-		return make_float3 (0, 0, 0); // vectors on same side -- not possible
-	// compute half-vector of the refraction (eq. 16)
+		return make_float3(0, 0, 0); /* vectors on same side -- not possible */
+
+	/* compute half-vector of the refraction (eq. 16) */
 	float3 ht = -(m_eta * omega_in + I);
 	float3 Ht = normalize(ht);
 	float cosHO = dot(Ht, I);
-
 	float cosHI = dot(Ht, omega_in);
-	// eq. 33: first we calculate D(m) with m=Ht:
-	float alpha2 = m_ag * m_ag;
+
+	/* these situations make the chi+ terms in eq. 33, 34 zero */
+	if(dot(Ht, N) <= 0.0f || cosHO * cosNO <= 0.0f || cosHI * cosNI <= 0.0f)
+		return make_float3(0.0f, 0.0f, 0.0f);
+
+	float D, G1o, G1i;
+
+	/* eq. 33: first we calculate D(m) with m=Ht: */
+	float alpha2 = alpha_x * alpha_y; /* only the isotropic formulas are used for transmission */
 	float cosThetaM = dot(N, Ht);
 	float cosThetaM2 = cosThetaM * cosThetaM;
 	float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2;
 	float cosThetaM4 = cosThetaM2 * cosThetaM2;
-	float D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
-	// eq. 34: now calculate G1(i,m) and G1(o,m)
-	float G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
-	float G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); 
+	D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
+
+	/* eq. 34: now calculate G1(i,m) and G1(o,m) */
+	G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
+	G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); 
+
 	float G = G1o * G1i;
-	// probability
-	float invHt2 = 1 / dot(ht, ht);
-	*pdf = D * fabsf(cosThetaM) * (fabsf(cosHI) * (m_eta * m_eta)) * invHt2;
-	float out = (fabsf(cosHI * cosHO) * (m_eta * m_eta) * (G * D) * invHt2) / cosNO;
-	return make_float3 (out, out, out);
+
+	/* probability */
+	float Ht2 = dot(ht, ht); /* squared length of the unnormalized half vector */
+
+	/* eq. 2 in distribution of visible normals sampling
+	 * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */
+
+	/* out = fabsf(cosHI * cosHO) * (m_eta * m_eta) * G * D / (cosNO * Ht2)
+	 * pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2 */
+	float common = D * (m_eta * m_eta) / (cosNO * Ht2); /* factor shared by out and pdf */
+	float out = G * fabsf(cosHI * cosHO) * common;
+	*pdf = G1o * cosHO * fabsf(cosHI) * common;
+
+	return make_float3(out, out, out);
 }
 
-ccl_device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-	float m_ag = sc->data0;
+	float alpha_x = sc->data0;
+	float alpha_y = sc->data1;
 	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
 	float3 N = sc->N;
 
 	float cosNO = dot(N, I);
 	if(cosNO > 0) {
 		float3 X, Y, Z = N;
-		make_orthonormals(Z, &X, &Y);
-		// generate a random microfacet normal m
-		// eq. 35,36:
-		// we take advantage of cos(atan(x)) == 1/sqrt(1+x^2)
-		//tttt  and sin(atan(x)) == x/sqrt(1+x^2)
-		float alpha2 = m_ag * m_ag;
-		float tanThetaM2 = alpha2 * randu / (1 - randu);
-		float cosThetaM  = 1 / safe_sqrtf(1 + tanThetaM2);
-		float sinThetaM  = cosThetaM * safe_sqrtf(tanThetaM2);
-		float phiM = M_2PI_F * randv;
-		float3 m = (cosf(phiM) * sinThetaM) * X +
-		           (sinf(phiM) * sinThetaM) * Y +
-		           (             cosThetaM) * Z;
+
+		if(alpha_x == alpha_y)
+			make_orthonormals(Z, &X, &Y);
+		else
+			make_orthonormals_tangent(Z, sc->T, &X, &Y);
+
+		/* importance sampling with distribution of visible normals. vectors are
+		 * transformed to local space before and after */
+		float3 local_I = make_float3(dot(X, I), dot(Y, I), cosNO);
+		float3 local_m;
+		float G1o;
+
+		local_m = microfacet_sample_stretched(kg, local_I, alpha_x, alpha_y,
+			randu, randv, false, &G1o);
+
+		float3 m = X*local_m.x + Y*local_m.y + Z*local_m.z;
+		float cosThetaM = local_m.z;
+
+		/* reflection or refraction? */
 		if(!m_refractive) {
 			float cosMO = dot(m, I);
+
 			if(cosMO > 0) {
-				// eq. 39 - compute actual reflected direction
+				/* eq. 39 - compute actual reflected direction */
 				*omega_in = 2 * cosMO * m - I;
+
 				if(dot(Ng, *omega_in) > 0) {
-					if (m_ag <= 1e-4f) {
-						// some high number for MIS
+					if(fmaxf(alpha_x, alpha_y) <= 1e-4f) {
+						/* some high number for MIS */
 						*pdf = 1e6f;
 						*eval = make_float3(1e6f, 1e6f, 1e6f);
 					}
 					else {
-						// microfacet normal is visible to this ray
-						// eq. 33
-						float cosThetaM2 = cosThetaM * cosThetaM;
-						float cosThetaM4 = cosThetaM2 * cosThetaM2;
-						float D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
-						// eq. 24
-						float pm = D * cosThetaM;
-						// convert into pdf of the sampled direction
-						// eq. 38 - but see also:
-						// eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf
-						*pdf = pm * 0.25f / cosMO;
-						// eval BRDF*cosNI
-						float cosNI = dot(N, *omega_in);
-						// eq. 34: now calculate G1(i,m) and G1(o,m)
-						float G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
-						float G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); 
-						float G = G1o * G1i;
-						// eq. 20: (F*G*D)/(4*in*on)
-						float out = (G * D) * 0.25f / cosNO;
+						/* microfacet normal is visible to this ray */
+						/* eq. 33 */
+						float alpha2 = alpha_x * alpha_y;
+						float D, G1i;
+
+						if(alpha_x == alpha_y) {
+							/* isotropic */
+							float cosThetaM2 = cosThetaM * cosThetaM;
+							float cosThetaM4 = cosThetaM2 * cosThetaM2;
+							float tanThetaM2 = 1/(cosThetaM2) - 1;
+							D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
+
+							/* eval BRDF*cosNI */
+							float cosNI = dot(N, *omega_in);
+
+							/* eq. 34: now calculate G1(i,m) */
+							G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); 
+						}
+						else {
+							/* anisotropic distribution */
+							float3 local_m = make_float3(dot(X, m), dot(Y, m), dot(Z, m));
+							float slope_x = -local_m.x/(local_m.z*alpha_x);
+							float slope_y = -local_m.y/(local_m.z*alpha_y);
+							float slope_len = 1 + slope_x*slope_x + slope_y*slope_y;
+
+							float cosThetaM = local_m.z;
+							float cosThetaM2 = cosThetaM * cosThetaM;
+							float cosThetaM4 = cosThetaM2 * cosThetaM2;
+
+							D = 1 / ((slope_len * slope_len) * M_PI_F * alpha2 * cosThetaM4);
+
+							/* calculate G1(i,m) */
+							float cosNI = dot(N, *omega_in);
+
+							float tanThetaI2 = (1 - cosNI * cosNI) / (cosNI * cosNI);
+							float cosPhiI = dot(*omega_in, X);
+							float sinPhiI = dot(*omega_in, Y);
+
+							float alphaI2 = (cosPhiI*cosPhiI)*(alpha_x*alpha_x) + (sinPhiI*sinPhiI)*(alpha_y*alpha_y);
+							alphaI2 /= cosPhiI*cosPhiI + sinPhiI*sinPhiI;
+
+							G1i = 2 / (1 + safe_sqrtf(1 + alphaI2 * tanThetaI2));
+						}
+
+						/* see eval function for derivation */
+						float common = (G1o * D) * 0.25f / cosNO;
+						float out = G1i * common;
+						*pdf = common;
+
 						*eval = make_float3(out, out, out);
 					}
 
@@ -198,14 +585,15 @@ ccl_device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, float3 Ng, fl
 			}
 		}
 		else {
-			// CAUTION: the i and o variables are inverted relative to the paper
-			// eq. 39 - compute actual refractive direction
+			/* CAUTION: the i and o variables are inverted relative to the paper
+			 * eq. 39 - compute actual refractive direction */
 			float3 R, T;
 #ifdef __RAY_DIFFERENTIALS__
 			float3 dRdx, dRdy, dTdx, dTdy;
 #endif
-			float m_eta = sc->data1;
+			float m_eta = sc->data2;
 			bool inside;
+
 			fresnel_dielectric(m_eta, m, I, &R, &T,
 #ifdef __RAY_DIFFERENTIALS__
 				dIdx, dIdy, &dRdx, &dRdy, &dTdx, &dTdy,
@@ -213,38 +601,43 @@ ccl_device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, float3 Ng, fl
 				&inside);
 			
 			if(!inside) {
+
 				*omega_in = T;
 #ifdef __RAY_DIFFERENTIALS__
 				*domega_in_dx = dTdx;
 				*domega_in_dy = dTdy;
 #endif
 
-				if (m_ag <= 1e-4f || fabsf(m_eta - 1.0f) < 1e-4f) {
-					// some high number for MIS
+				if(fmaxf(alpha_x, alpha_y) <= 1e-4f || fabsf(m_eta - 1.0f) < 1e-4f) {
+					/* some high number for MIS */
 					*pdf = 1e6f;
 					*eval = make_float3(1e6f, 1e6f, 1e6f);
 				}
 				else {
-					// eq. 33
+					/* eq. 33 */
+					float alpha2 = alpha_x * alpha_y;
 					float cosThetaM2 = cosThetaM * cosThetaM;
 					float cosThetaM4 = cosThetaM2 * cosThetaM2;
+					float tanThetaM2 = 1/(cosThetaM2) - 1;
 					float D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
-					// eq. 24
-					float pm = D * cosThetaM;
-					// eval BRDF*cosNI
+
+					/* eval BRDF*cosNI */
 					float cosNI = dot(N, *omega_in);
-					// eq. 34: now calculate G1(i,m) and G1(o,m)
-					float G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
+
+					/* eq. 34: now calculate G1(i,m) */
 					float G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); 
-					float G = G1o * G1i;
-					// eq. 21
+
+					/* eq. 21 */
 					float cosHI = dot(m, *omega_in);
 					float cosHO = dot(m, I);
 					float Ht2 = m_eta * cosHI + cosHO;
 					Ht2 *= Ht2;
-					float out = (fabsf(cosHI * cosHO) * (m_eta * m_eta) * (G * D)) / (cosNO * Ht2);
-					// eq. 38 and eq. 17
-					*pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2;
+
+					/* see eval function for derivation */
+					float common = (G1o * D) * (m_eta * m_eta) / (cosNO * Ht2);
+					float out = G1i * fabsf(cosHI * cosHO) * common;
+					*pdf = cosHO * fabsf(cosHI) * common;
+
 					*eval = make_float3(out, out, out);
 				}
 			}
@@ -253,19 +646,33 @@ ccl_device int bsdf_microfacet_ggx_sample(const ShaderClosure *sc, float3 Ng, fl
 	return (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY;
 }
 
-/* BECKMANN */
+/* Beckmann microfacet with Smith shadow-masking from:
+ *
+ * Microfacet Models for Refraction through Rough Surfaces
+ * B. Walter, S. R. Marschner, H. Li, K. E. Torrance, EGSR 2007 */
 
 ccl_device int bsdf_microfacet_beckmann_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ab */
+	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x, clamped to [0, 1] */
+	sc->data1 = sc->data0; /* alpha_y = alpha_x, so the isotropic code paths are taken */
 
 	sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ID;
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
 }
 
+ccl_device int bsdf_microfacet_beckmann_aniso_setup(ShaderClosure *sc) /* anisotropic Beckmann: data0 and data1 are kept as independent roughness values */
+{
+	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x, clamped to [0, 1] */
+	sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* alpha_y, clamped to [0, 1]; may differ from alpha_x */
+
+	sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID;
+	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
+}
+
 ccl_device int bsdf_microfacet_beckmann_refraction_setup(ShaderClosure *sc)
 {
-	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* m_ab */
+	sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
+	sc->data1 = sc->data0; /* alpha_y */
 
 	sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
@@ -273,155 +680,257 @@ ccl_device int bsdf_microfacet_beckmann_refraction_setup(ShaderClosure *sc)
 
 ccl_device void bsdf_microfacet_beckmann_blur(ShaderClosure *sc, float roughness)
 {
-	sc->data0 = fmaxf(roughness, sc->data0); /* m_ab */
+	sc->data0 = fmaxf(roughness, sc->data0); /* alpha_x: raised to at least roughness */
+	sc->data1 = fmaxf(roughness, sc->data1); /* alpha_y: raised to at least roughness */
 }
 
 ccl_device float3 bsdf_microfacet_beckmann_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	float m_ab = max(sc->data0, 1e-4f);
+	float alpha_x = sc->data0; /* alpha_x/alpha_y: roughness along the X and Y axes of the tangent frame built below */
+	float alpha_y = sc->data1;
 	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
 	float3 N = sc->N;
 
-	if(m_refractive || m_ab <= 1e-4f)
-		return make_float3 (0, 0, 0);
+	if(m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f || alpha_x*alpha_y <= 0.0f) /* also reject a single zero alpha: the anisotropic D below divides by alpha_x*alpha_y and would be NaN/inf */
+		return make_float3(0, 0, 0);
+
 	float cosNO = dot(N, I);
 	float cosNI = dot(N, omega_in);
+
 	if(cosNO > 0 && cosNI > 0) {
-	   // get half vector
-	   float3 Hr = normalize(omega_in + I);
-	   // eq. 20: (F*G*D)/(4*in*on)
-	   // eq. 25: first we calculate D(m) with m=Hr:
-	   float alpha2 = m_ab * m_ab;
-	   float cosThetaM = dot(N, Hr);
-	   float cosThetaM2 = cosThetaM * cosThetaM;
-	   float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2;
-	   float cosThetaM4 = cosThetaM2 * cosThetaM2;
-	   float D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 *  cosThetaM4);
-	   // eq. 26, 27: now calculate G1(i,m) and G1(o,m)
-	   float ao = 1 / (m_ab * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO)));
-	   float ai = 1 / (m_ab * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI)));
-	   float G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f;
-	   float G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f;
-	   float G = G1o * G1i;
-	   float out = (G * D) * 0.25f / cosNO;
-	   // eq. 24
-	   float pm = D * cosThetaM;
-	   // convert into pdf of the sampled direction
-	   // eq. 38 - but see also:
-	   // eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf
-	   *pdf = pm * 0.25f / dot(Hr, I);
-	   return make_float3 (out, out, out);
+		/* get half vector */
+		float3 m = normalize(omega_in + I);
+
+		float alpha2 = alpha_x * alpha_y;
+		float D, G1o, G1i;
+
+		if(alpha_x == alpha_y) {
+			/* isotropic
+			 * eq. 20: (F*G*D)/(4*in*on)
+			 * eq. 25: first we calculate D(m) */
+			float cosThetaM = dot(N, m);
+			float cosThetaM2 = cosThetaM * cosThetaM;
+			float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2;
+			float cosThetaM4 = cosThetaM2 * cosThetaM2;
+			D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 * cosThetaM4);
+
+			/* eq. 26, 27: now calculate G1(i,m) and G1(o,m) */
+			float ao = 1 / (alpha_x * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO)));
+			float ai = 1 / (alpha_x * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI)));
+			G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f;
+			G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f;
+		}
+		else {
+			/* anisotropic */
+			float3 X, Y, Z = N;
+			make_orthonormals_tangent(Z, sc->T, &X, &Y);
+
+			/* distribution */
+			float3 local_m = make_float3(dot(X, m), dot(Y, m), dot(Z, m));
+			float slope_x = -local_m.x/(local_m.z*alpha_x);
+			float slope_y = -local_m.y/(local_m.z*alpha_y);
+
+			float cosThetaM = local_m.z;
+			float cosThetaM2 = cosThetaM * cosThetaM;
+			float cosThetaM4 = cosThetaM2 * cosThetaM2;
+
+			D = expf(-slope_x*slope_x - slope_y*slope_y) / (M_PI_F * alpha2 * cosThetaM4);
+
+			/* G1(i,m) and G1(o,m) */
+			float tanThetaO2 = (1 - cosNO * cosNO) / (cosNO * cosNO);
+			float cosPhiO = dot(I, X);
+			float sinPhiO = dot(I, Y);
+
+			float alphaO2 = (cosPhiO*cosPhiO)*(alpha_x*alpha_x) + (sinPhiO*sinPhiO)*(alpha_y*alpha_y);
+			alphaO2 /= cosPhiO*cosPhiO + sinPhiO*sinPhiO;
+
+			float tanThetaI2 = (1 - cosNI * cosNI) / (cosNI * cosNI);
+			float cosPhiI = dot(omega_in, X);
+			float sinPhiI = dot(omega_in, Y);
+
+			float alphaI2 = (cosPhiI*cosPhiI)*(alpha_x*alpha_x) + (sinPhiI*sinPhiI)*(alpha_y*alpha_y);
+			alphaI2 /= cosPhiI*cosPhiI + sinPhiI*sinPhiI;
+
+			float ao = 1 / (safe_sqrtf(alphaO2 * tanThetaO2));
+			float ai = 1 / (safe_sqrtf(alphaI2 * tanThetaI2));
+			G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f;
+			G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f;
+		}
+
+		float G = G1o * G1i;
+
+		/* eq. 20 */
+		float common = D * 0.25f / cosNO; /* factor shared by the BSDF value and the pdf */
+		float out = G * common;
+
+		/* eq. 2 in distribution of visible normals sampling
+		 * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */
+
+		/* eq. 38 - but see also:
+		 * eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf
+		 * pdf = pm * 0.25 / dot(m, I); */
+		*pdf = G1o * common;
+
+		return make_float3(out, out, out);
 	}
-	return make_float3 (0, 0, 0);
+
+	return make_float3(0, 0, 0);
 }
 
 ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
 {
-	float m_ab = max(sc->data0, 1e-4f);
-	float m_eta = sc->data1;
+	float alpha_x = sc->data0;
+	float alpha_y = sc->data1;
+	float m_eta = sc->data2; /* eta is read from data2 now that data1 holds alpha_y */
 	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
 	float3 N = sc->N;
 
-	if(!m_refractive || m_ab <= 1e-4f)
-		return make_float3 (0, 0, 0);
+	if(!m_refractive || fmaxf(alpha_x, alpha_y) <= 1e-4f)
+		return make_float3(0, 0, 0);
+
 	float cosNO = dot(N, I);
 	float cosNI = dot(N, omega_in);
+
 	if(cosNO <= 0 || cosNI >= 0)
-		return make_float3 (0, 0, 0);
-	// compute half-vector of the refraction (eq. 16)
+		return make_float3(0, 0, 0);
+
+	/* compute half-vector of the refraction (eq. 16) */
 	float3 ht = -(m_eta * omega_in + I);
 	float3 Ht = normalize(ht);
 	float cosHO = dot(Ht, I);
-
 	float cosHI = dot(Ht, omega_in);
-	// eq. 33: first we calculate D(m) with m=Ht:
-	float alpha2 = m_ab * m_ab;
+
+	/* these situations make the chi+ terms in eq. 25, 27 zero */
+	if(dot(Ht, N) <= 0.0f || cosHO * cosNO <= 0.0f || cosHI * cosNI <= 0.0f)
+		return make_float3(0.0f, 0.0f, 0.0f);
+
+	/* eq. 25: first we calculate D(m) with m=Ht: */
+	float alpha2 = alpha_x * alpha_y; /* only the isotropic formulas are used for transmission */
 	float cosThetaM = min(dot(N, Ht), 1.0f);
 	float cosThetaM2 = cosThetaM * cosThetaM;
 	float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2;
 	float cosThetaM4 = cosThetaM2 * cosThetaM2;
 	float D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 *  cosThetaM4);
-	// eq. 26, 27: now calculate G1(i,m) and G1(o,m)
-	float ao = 1 / (m_ab * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO)));
-	float ai = 1 / (m_ab * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI)));
+
+	/* eq. 26, 27: now calculate G1(i,m) and G1(o,m) */
+	float ao = 1 / (alpha_x * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO)));
+	float ai = 1 / (alpha_x * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI)));
 	float G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f;
 	float G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f;
 	float G = G1o * G1i;
-	// probability
-	float invHt2 = 1 / dot(ht, ht);
-	*pdf = D * fabsf(cosThetaM) * (fabsf(cosHI) * (m_eta * m_eta)) * invHt2;
-	float out = (fabsf(cosHI * cosHO) * (m_eta * m_eta) * (G * D) * invHt2) / cosNO;
-	return make_float3 (out, out, out);
+
+	/* probability */
+	float Ht2 = dot(ht, ht); /* squared length of the unnormalized half vector */
+
+	/* eq. 2 in distribution of visible normals sampling
+	 * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */
+
+	/* out = fabsf(cosHI * cosHO) * (m_eta * m_eta) * G * D / (cosNO * Ht2)
+	 * pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2 */
+	float common = D * (m_eta * m_eta) / (cosNO * Ht2); /* factor shared by out and pdf */
+	float out = G * fabsf(cosHI * cosHO) * common;
+	*pdf = G1o * cosHO * fabsf(cosHI) * common;
+
+	return make_float3(out, out, out);
 }
 
-ccl_device int bsdf_microfacet_beckmann_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
+ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
 {
-	float m_ab = sc->data0;
+	float alpha_x = sc->data0;
+	float alpha_y = sc->data1;
 	int m_refractive = sc->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
 	float3 N = sc->N;
 
 	float cosNO = dot(N, I);
 	if(cosNO > 0) {
 		float3 X, Y, Z = N;
-		make_orthonormals(Z, &X, &Y);
-		// generate a random microfacet normal m
-		// eq. 35,36:
-		// we take advantage of cos(atan(x)) == 1/sqrt(1+x^2)
-		//tttt  and sin(atan(x)) == x/sqrt(1+x^2)
-		float alpha2 = m_ab * m_ab;
-		float tanThetaM, cosThetaM;
-
-		if(alpha2 == 0.0f) {
-			tanThetaM = 0.0f;
-			cosThetaM = 1.0f;
-		}
-		else {
-			tanThetaM = safe_sqrtf(-alpha2 * logf(1 - randu));
-			cosThetaM = 1 / safe_sqrtf(1 + tanThetaM * tanThetaM);
-		}
 
-		float sinThetaM = cosThetaM * tanThetaM;
-		float phiM = M_2PI_F * randv;
-		float3 m = (cosf(phiM) * sinThetaM) * X +
-		           (sinf(phiM) * sinThetaM) * Y +
-		           (             cosThetaM) * Z;
+		if(alpha_x == alpha_y)
+			make_orthonormals(Z, &X, &Y);
+		else
+			make_orthonormals_tangent(Z, sc->T, &X, &Y);
+
+		/* importance sampling with distribution of visible normals. vectors are
+		 * transformed to local space before and after */
+		float3 local_I = make_float3(dot(X, I), dot(Y, I), cosNO);
+		float3 local_m;
+		float G1o;
 
+		local_m = microfacet_sample_stretched(kg, local_I, alpha_x, alpha_y,
+			randu, randv, true, &G1o); /* per-axis roughness, consistent with the anisotropic D used for the pdf */
+
+		float3 m = X*local_m.x + Y*local_m.y + Z*local_m.z;
+		float cosThetaM = local_m.z;
+
+		/* reflection or refraction? */
 		if(!m_refractive) {
 			float cosMO = dot(m, I);
+
 			if(cosMO > 0) {
-				// eq. 39 - compute actual reflected direction
+				/* eq. 39 - compute actual reflected direction */
 				*omega_in = 2 * cosMO * m - I;
+
 				if(dot(Ng, *omega_in) > 0) {
-					if (m_ab <= 1e-4f) {
-						// some high number for MIS
+					if(fmaxf(alpha_x, alpha_y) <= 1e-4f) {
+						/* some high number for MIS */
 						*pdf = 1e6f;
 						*eval = make_float3(1e6f, 1e6f, 1e6f);
 					}
 					else {
-						// microfacet normal is visible to this ray
-						// eq. 25
-						float cosThetaM2 = cosThetaM * cosThetaM;
-						float tanThetaM2 = tanThetaM * tanThetaM;
-						float cosThetaM4 = cosThetaM2 * cosThetaM2;
-						float D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 *  cosThetaM4);
-						// eq. 24
-						float pm = D * cosThetaM;
-						// convert into pdf of the sampled direction
-						// eq. 38 - but see also:
-						// eq. 17 in http://www.graphics.cornell.edu/~bjw/wardnotes.pdf
-						*pdf = pm * 0.25f / cosMO;
-						// Eval BRDF*cosNI
-						float cosNI = dot(N, *omega_in);
-						// eq. 26, 27: now calculate G1(i,m) and G1(o,m)
-						float ao = 1 / (m_ab * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO)));
-						float ai = 1 / (m_ab * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI)));
-						float G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f;
-						float G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f;
+						/* microfacet normal is visible to this ray
+						 * eq. 25 */
+						float alpha2 = alpha_x * alpha_y;
+						float D, G1i;
+
+						if(alpha_x == alpha_y) {
+							/* isotropic distribution */
+							float cosThetaM2 = cosThetaM * cosThetaM;
+							float cosThetaM4 = cosThetaM2 * cosThetaM2;
+							float tanThetaM2 = 1/(cosThetaM2) - 1;
+							D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 *  cosThetaM4);
+
+							/* eval BRDF*cosNI */
+							float cosNI = dot(N, *omega_in);
+
+							/* eq. 26, 27: now calculate G1(i,m) */
+							float ai = 1 / (alpha_x * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI)));
+							G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f;
+						}
+						else {
+							/* anisotropic distribution */
+							float3 local_m = make_float3(dot(X, m), dot(Y, m), dot(Z, m));
+							float slope_x = -local_m.x/(local_m.z*alpha_x);
+							float slope_y = -local_m.y/(local_m.z*alpha_y);
+
+							float cosThetaM = local_m.z;
+							float cosThetaM2 = cosThetaM * cosThetaM;
+							float cosThetaM4 = cosThetaM2 * cosThetaM2;
+
+							D = expf(-slope_x*slope_x - slope_y*slope_y) / (M_PI_F * alpha2 * cosThetaM4);
+
+							/* G1(i,m) */
+							float cosNI = dot(N, *omega_in);
+							float tanThetaI2 = (1 - cosNI * cosNI) / (cosNI * cosNI);
+							float cosPhiI = dot(*omega_in, X);
+							float sinPhiI = dot(*omega_in, Y);
+
+							float alphaI2 = (cosPhiI*cosPhiI)*(alpha_x*alpha_x) + (sinPhiI*sinPhiI)*(alpha_y*alpha_y);
+							alphaI2 /= cosPhiI*cosPhiI + sinPhiI*sinPhiI;
+
+							float ai = 1 / (safe_sqrtf(alphaI2 * tanThetaI2));
+							G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f;
+						}
+
 						float G = G1o * G1i;
-						// eq. 20: (F*G*D)/(4*in*on)
-						float out = (G * D) * 0.25f / cosNO;
+
+						/* see eval function for derivation */
+						float common = D * 0.25f / cosNO;
+						float out = G * common;
+						*pdf = G1o * common;
+
 						*eval = make_float3(out, out, out);
 					}
+
 #ifdef __RAY_DIFFERENTIALS__
 					*domega_in_dx = (2 * dot(m, dIdx)) * m - dIdx;
 					*domega_in_dy = (2 * dot(m, dIdy)) * m - dIdy;
@@ -430,14 +939,15 @@ ccl_device int bsdf_microfacet_beckmann_sample(const ShaderClosure *sc, float3 N
 			}
 		}
 		else {
-			// CAUTION: the i and o variables are inverted relative to the paper
-			// eq. 39 - compute actual refractive direction
+			/* CAUTION: the i and o variables are inverted relative to the paper
+			 * eq. 39 - compute actual refractive direction */
 			float3 R, T;
 #ifdef __RAY_DIFFERENTIALS__
 			float3 dRdx, dRdy, dTdx, dTdy;
 #endif
-			float m_eta = sc->data1;
+			float m_eta = sc->data2;
 			bool inside;
+
 			fresnel_dielectric(m_eta, m, I, &R, &T,
 #ifdef __RAY_DIFFERENTIALS__
 				dIdx, dIdy, &dRdx, &dRdy, &dTdx, &dTdy,
@@ -446,39 +956,44 @@ ccl_device int bsdf_microfacet_beckmann_sample(const ShaderClosure *sc, float3 N
 
 			if(!inside) {
 				*omega_in = T;
+
 #ifdef __RAY_DIFFERENTIALS__
 				*domega_in_dx = dTdx;
 				*domega_in_dy = dTdy;
 #endif
-				if (m_ab <= 1e-4f || fabsf(m_eta - 1.0f) < 1e-4f) {
-					// some high number for MIS
+
+				if(fmaxf(alpha_x, alpha_y) <= 1e-4f || fabsf(m_eta - 1.0f) < 1e-4f) {
+					/* some high number for MIS */
 					*pdf = 1e6f;
 					*eval = make_float3(1e6f, 1e6f, 1e6f);
 				}
 				else {
-					// eq. 33
+					/* eq. 33 */
+					float alpha2 = alpha_x * alpha_y;
 					float cosThetaM2 = cosThetaM * cosThetaM;
-					float tanThetaM2 = tanThetaM * tanThetaM;
 					float cosThetaM4 = cosThetaM2 * cosThetaM2;
+					float tanThetaM2 = 1/(cosThetaM2) - 1;
 					float D = expf(-tanThetaM2 / alpha2) / (M_PI_F * alpha2 *  cosThetaM4);
-					// eq. 24
-					float pm = D * cosThetaM;
-					// eval BRDF*cosNI
+
+					/* eval BRDF*cosNI */
 					float cosNI = dot(N, *omega_in);
-					// eq. 26, 27: now calculate G1(i,m) and G1(o,m)
-					float ao = 1 / (m_ab * safe_sqrtf((1 - cosNO * cosNO) / (cosNO * cosNO)));
-					float ai = 1 / (m_ab * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI)));
-					float G1o = ao < 1.6f ? (3.535f * ao + 2.181f * ao * ao) / (1 + 2.276f * ao + 2.577f * ao * ao) : 1.0f;
+
+					/* eq. 26, 27: now calculate G1(i,m) */
+					float ai = 1 / (alpha_x * safe_sqrtf((1 - cosNI * cosNI) / (cosNI * cosNI)));
 					float G1i = ai < 1.6f ? (3.535f * ai + 2.181f * ai * ai) / (1 + 2.276f * ai + 2.577f * ai * ai) : 1.0f;
 					float G = G1o * G1i;
-					// eq. 21
+
+					/* eq. 21 */
 					float cosHI = dot(m, *omega_in);
 					float cosHO = dot(m, I);
 					float Ht2 = m_eta * cosHI + cosHO;
 					Ht2 *= Ht2;
-					float out = (fabsf(cosHI * cosHO) * (m_eta * m_eta) * (G * D)) / (cosNO * Ht2);
-					// eq. 38 and eq. 17
-					*pdf = pm * (m_eta * m_eta) * fabsf(cosHI) / Ht2;
+
+					/* see eval function for derivation */
+					float common = D * (m_eta * m_eta) / (cosNO * Ht2);
+					float out = G * fabsf(cosHI * cosHO) * common;
+					*pdf = G1o * cosHO * fabsf(cosHI) * common;
+
 					*eval = make_float3(out, out, out);
 				}
 			}
diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h
index b3dcb9dcc38..05816bac2c1 100644
--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -111,16 +111,20 @@ ccl_device float fresnel_dielectric_cos(float cosi, float eta)
 	return 1.0f; // TIR(no refracted component)
 }
 
-ccl_device float fresnel_conductor(float cosi, float eta, float k)
+#if 0
+ccl_device float3 fresnel_conductor(float cosi, const float3 eta, const float3 k)
 {
-	float tmp_f = eta * eta + k * k;
-	float tmp = tmp_f * cosi * cosi;
-	float Rparl2 = (tmp - (2.0f * eta * cosi) + 1)/
-	               (tmp + (2.0f * eta * cosi) + 1);
-	float Rperp2 = (tmp_f - (2.0f * eta * cosi) + cosi * cosi)/
-	               (tmp_f + (2.0f * eta * cosi) + cosi * cosi);
+	float3 cosi2 = make_float3(cosi*cosi);
+	float3 one = make_float3(1.0f, 1.0f, 1.0f);
+	float3 tmp_f = eta * eta + k * k;
+	float3 tmp = tmp_f * cosi2;
+	float3 Rparl2 = (tmp - (2.0f * eta * cosi) + one) /
+					(tmp + (2.0f * eta * cosi) + one);
+	float3 Rperp2 = (tmp_f - (2.0f * eta * cosi) + cosi2) /
+					(tmp_f + (2.0f * eta * cosi) + cosi2);
 	return(Rparl2 + Rperp2) * 0.5f;
 }
+#endif
 
 ccl_device float smooth_step(float edge0, float edge1, float x)
 {
diff --git a/intern/cycles/kernel/closure/bsdf_ward.h b/intern/cycles/kernel/closure/bsdf_ward.h
deleted file mode 100644
index c9de615a011..00000000000
--- a/intern/cycles/kernel/closure/bsdf_ward.h
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Adapted from Open Shading Language with this license:
- *
- * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
- * All Rights Reserved.
- *
- * Modifications Copyright 2011, Blender Foundation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- * * Redistributions of source code must retain the above copyright
- *   notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- *   notice, this list of conditions and the following disclaimer in the
- *   documentation and/or other materials provided with the distribution.
- * * Neither the name of Sony Pictures Imageworks nor the names of its
- *   contributors may be used to endorse or promote products derived from
- *   this software without specific prior written permission.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __BSDF_WARD_H__
-#define __BSDF_WARD_H__
-
-CCL_NAMESPACE_BEGIN
-
-/* WARD */
-
-ccl_device int bsdf_ward_setup(ShaderClosure *sc)
-{
-	sc->data0 = clamp(sc->data0, 1e-4f, 1.0f); /* m_ax */
-	sc->data1 = clamp(sc->data1, 1e-4f, 1.0f); /* m_ay */
-
-	sc->type = CLOSURE_BSDF_WARD_ID;
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
-}
-
-ccl_device void bsdf_ward_blur(ShaderClosure *sc, float roughness)
-{
-	sc->data0 = fmaxf(roughness, sc->data0); /* m_ax */
-	sc->data1 = fmaxf(roughness, sc->data1); /* m_ay */
-}
-
-ccl_device float3 bsdf_ward_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
-	float m_ax = sc->data0;
-	float m_ay = sc->data1;
-	float3 N = sc->N;
-	float3 T = sc->T;
-
-	float cosNO = dot(N, I);
-	float cosNI = dot(N, omega_in);
-
-	if(cosNI > 0.0f && cosNO > 0.0f) {
-		cosNO = max(cosNO, 1e-4f);
-		cosNI = max(cosNI, 1e-4f);
-
-		// get half vector and get x,y basis on the surface for anisotropy
-		float3 H = normalize(omega_in + I); // normalize needed for pdf
-		float3 X, Y;
-		make_orthonormals_tangent(N, T, &X, &Y);
-		// eq. 4
-		float dotx = dot(H, X) / m_ax;
-		float doty = dot(H, Y) / m_ay;
-		float dotn = dot(H, N);
-		float exp_arg = (dotx * dotx + doty * doty) / (dotn * dotn);
-		float denom = (M_4PI_F * m_ax * m_ay * sqrtf(cosNO * cosNI));
-		float exp_val = expf(-exp_arg);
-		float out = cosNI * exp_val / denom;
-		float oh = dot(H, I);
-		denom = M_4PI_F * m_ax * m_ay * oh * dotn * dotn * dotn;
-		*pdf = exp_val / denom;
-		return make_float3 (out, out, out);
-	}
-
-	return make_float3 (0, 0, 0);
-}
-
-ccl_device float3 bsdf_ward_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
-	return make_float3(0.0f, 0.0f, 0.0f);
-}
-
-ccl_device int bsdf_ward_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
-{
-	float m_ax = sc->data0;
-	float m_ay = sc->data1;
-	float3 N = sc->N;
-	float3 T = sc->T;
-
-	float cosNO = dot(N, I);
-	if(cosNO > 0.0f) {
-		// get x,y basis on the surface for anisotropy
-		float3 X, Y;
-		make_orthonormals_tangent(N, T, &X, &Y);
-		// generate random angles for the half vector
-		// eq. 7 (taking care around discontinuities to keep
-		//ttoutput angle in the right quadrant)
-		// we take advantage of cos(atan(x)) == 1/sqrt(1+x^2)
-		//tttt  and sin(atan(x)) == x/sqrt(1+x^2)
-		float alphaRatio = m_ay / m_ax;
-		float cosPhi, sinPhi;
-		if(randu < 0.25f) {
-			float val = 4 * randu;
-			float tanPhi = alphaRatio * tanf(M_PI_2_F * val);
-			cosPhi = 1 / sqrtf(1 + tanPhi * tanPhi);
-			sinPhi = tanPhi * cosPhi;
-		}
-		else if(randu < 0.5f) {
-			float val = 1 - 4 * (0.5f - randu);
-			float tanPhi = alphaRatio * tanf(M_PI_2_F * val);
-			// phi = M_PI_F - phi;
-			cosPhi = -1 / sqrtf(1 + tanPhi * tanPhi);
-			sinPhi = -tanPhi * cosPhi;
-		}
-		else if(randu < 0.75f) {
-			float val = 4 * (randu - 0.5f);
-			float tanPhi = alphaRatio * tanf(M_PI_2_F * val);
-			//phi = M_PI_F + phi;
-			cosPhi = -1 / sqrtf(1 + tanPhi * tanPhi);
-			sinPhi = tanPhi * cosPhi;
-		}
-		else {
-			float val = 1 - 4 * (1 - randu);
-			float tanPhi = alphaRatio * tanf(M_PI_2_F * val);
-			// phi = M_2PI_F - phi;
-			cosPhi = 1 / sqrtf(1 + tanPhi * tanPhi);
-			sinPhi = -tanPhi * cosPhi;
-		}
-		// eq. 6
-		// we take advantage of cos(atan(x)) == 1/sqrt(1+x^2)
-		//tttt  and sin(atan(x)) == x/sqrt(1+x^2)
-		float thetaDenom = (cosPhi * cosPhi) / (m_ax * m_ax) + (sinPhi * sinPhi) / (m_ay * m_ay);
-		float tanTheta2 = -logf(1 - randv) / thetaDenom;
-		float cosTheta  = 1 / sqrtf(1 + tanTheta2);
-		float sinTheta  = cosTheta * sqrtf(tanTheta2);
-
-		float3 h; // already normalized becaused expressed from spherical coordinates
-		h.x = sinTheta * cosPhi;
-		h.y = sinTheta * sinPhi;
-		h.z = cosTheta;
-		// compute terms that are easier in local space
-		float dotx = h.x / m_ax;
-		float doty = h.y / m_ay;
-		float dotn = h.z;
-		// transform to world space
-		h = h.x * X + h.y * Y + h.z * N;
-		// generate the final sample
-		float oh = dot(h, I);
-		*omega_in = 2.0f * oh * h - I;
-		if(dot(Ng, *omega_in) > 0) {
-			float cosNI = dot(N, *omega_in);
-			if(cosNI > 0) {
-				cosNO = max(cosNO, 1e-4f);
-				cosNI = max(cosNI, 1e-4f);
-
-				// eq. 9
-				float exp_arg = (dotx * dotx + doty * doty) / (dotn * dotn);
-				float denom = M_4PI_F * m_ax * m_ay * oh * dotn * dotn * dotn;
-				*pdf = expf(-exp_arg) / denom;
-				// compiler will reuse expressions already computed
-				denom = (M_4PI_F * m_ax * m_ay * sqrtf(cosNO * cosNI));
-				float power = cosNI * expf(-exp_arg) / denom;
-				*eval = make_float3(power, power, power);
-#ifdef __RAY_DIFFERENTIALS__
-				*domega_in_dx = (2 * dot(N, dIdx)) * N - dIdx;
-				*domega_in_dy = (2 * dot(N, dIdy)) * N - dIdy;
-#endif
-			}
-		}
-	}
-	return LABEL_REFLECT|LABEL_GLOSSY;
-}
-
-CCL_NAMESPACE_END
-
-#endif /* __BSDF_WARD_H__ */
-
diff --git a/intern/cycles/kernel/closure/bsdf_westin.h b/intern/cycles/kernel/closure/bsdf_westin.h
deleted file mode 100644
index 9dc1c00bb3d..00000000000
--- a/intern/cycles/kernel/closure/bsdf_westin.h
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Adapted from Open Shading Language with this license:
- *
- * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
- * All Rights Reserved.
- *
- * Modifications Copyright 2011, Blender Foundation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- * * Redistributions of source code must retain the above copyright
- *   notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- *   notice, this list of conditions and the following disclaimer in the
- *   documentation and/or other materials provided with the distribution.
- * * Neither the name of Sony Pictures Imageworks nor the names of its
- *   contributors may be used to endorse or promote products derived from
- *   this software without specific prior written permission.
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __BSDF_WESTIN_H__
-#define __BSDF_WESTIN_H__
-
-CCL_NAMESPACE_BEGIN
-
-/* WESTIN BACKSCATTER */
-
-ccl_device int bsdf_westin_backscatter_setup(ShaderClosure *sc)
-{
-	float roughness = sc->data0;
-	roughness = clamp(roughness, 1e-5f, 1.0f);
-	float m_invroughness = 1.0f/roughness;
-
-	sc->type = CLOSURE_BSDF_WESTIN_BACKSCATTER_ID;
-	sc->data0 = m_invroughness;
-
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
-}
-
-ccl_device void bsdf_westin_backscatter_blur(ShaderClosure *sc, float roughness)
-{
-	float m_invroughness = sc->data0;
-	m_invroughness = min(1.0f/roughness, m_invroughness);
-	sc->data0 = m_invroughness;
-}
-
-ccl_device float3 bsdf_westin_backscatter_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
-	float m_invroughness = sc->data0;
-	float3 N = sc->N;
-
-	// pdf is implicitly 0 (no indirect sampling)
-	float cosNO = dot(N, I);
-	float cosNI = dot(N, omega_in);
-	if(cosNO > 0 && cosNI > 0) {
-		float cosine = dot(I, omega_in);
-		*pdf = cosine > 0 ? (m_invroughness + 1) * powf(cosine, m_invroughness) : 0;
-		*pdf *= 0.5f * M_1_PI_F;
-		return make_float3 (*pdf, *pdf, *pdf);
-	}
-	return make_float3 (0, 0, 0);
-}
-
-ccl_device float3 bsdf_westin_backscatter_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
-	return make_float3(0.0f, 0.0f, 0.0f);
-}
-
-ccl_device int bsdf_westin_backscatter_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
-{
-	float m_invroughness = sc->data0;
-	float3 N = sc->N;
-
-	float cosNO = dot(N, I);
-	if(cosNO > 0) {
-#ifdef __RAY_DIFFERENTIALS__
-		*domega_in_dx = dIdx;
-		*domega_in_dy = dIdy;
-#endif
-		float3 T, B;
-		make_orthonormals (I, &T, &B);
-		float phi = M_2PI_F * randu;
-		float cosTheta = powf(randv, 1 / (m_invroughness + 1));
-		float sinTheta2 = 1 - cosTheta * cosTheta;
-		float sinTheta = sinTheta2 > 0 ? sqrtf(sinTheta2) : 0;
-		*omega_in = (cosf(phi) * sinTheta) * T +
-		            (sinf(phi) * sinTheta) * B +
-		            (cosTheta) * I;
-		if(dot(Ng, *omega_in) > 0) {
-			// common terms for pdf and eval
-			float cosNI = dot(N, *omega_in);
-			// make sure the direction we chose is still in the right hemisphere
-			if(cosNI > 0)
-			{
-				*pdf = 0.5f * M_1_PI_F * powf(cosTheta, m_invroughness);
-				*pdf = (m_invroughness + 1) * (*pdf);
-				*eval = make_float3(*pdf, *pdf, *pdf);
-			}
-		}
-	}
-	return LABEL_REFLECT|LABEL_GLOSSY;
-}
-
-/* WESTIN SHEEN */
-
-ccl_device int bsdf_westin_sheen_setup(ShaderClosure *sc)
-{
-	/* float edginess = sc->data0; */
-	sc->type = CLOSURE_BSDF_WESTIN_SHEEN_ID;
-	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY;
-}
-
-ccl_device void bsdf_westin_sheen_blur(ShaderClosure *sc, float roughness)
-{
-}
-
-ccl_device float3 bsdf_westin_sheen_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
-	float m_edginess = sc->data0;
-	float3 N = sc->N;
-
-	// pdf is implicitly 0 (no indirect sampling)
-	float cosNO = dot(N, I);
-	float cosNI = dot(N, omega_in);
-	if(cosNO > 0 && cosNI > 0) {
-		float sinNO2 = 1 - cosNO * cosNO;
-		*pdf = cosNI * M_1_PI_F;
-		float westin = sinNO2 > 0 ? powf(sinNO2, 0.5f * m_edginess) * (*pdf) : 0;
-		return make_float3 (westin, westin, westin);
-	}
-	return make_float3 (0, 0, 0);
-}
-
-ccl_device float3 bsdf_westin_sheen_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
-{
-	return make_float3(0.0f, 0.0f, 0.0f);
-}
-
-ccl_device int bsdf_westin_sheen_sample(const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf)
-{
-	float m_edginess = sc->data0;
-	float3 N = sc->N;
-
-	// we are viewing the surface from the right side - send a ray out with cosine
-	// distribution over the hemisphere
-	sample_cos_hemisphere(N, randu, randv, omega_in, pdf);
-	if(dot(Ng, *omega_in) > 0) {
-		// TODO: account for sheen when sampling
-		float cosNO = dot(N, I);
-		float sinNO2 = 1 - cosNO * cosNO;
-		float westin = sinNO2 > 0 ? powf(sinNO2, 0.5f * m_edginess) * (*pdf) : 0;
-		*eval = make_float3(westin, westin, westin);
-#ifdef __RAY_DIFFERENTIALS__
-		// TODO: find a better approximation for the diffuse bounce
-		*domega_in_dx = (2 * dot(N, dIdx)) * N - dIdx;
-		*domega_in_dy = (2 * dot(N, dIdy)) * N - dIdy;
-#endif
-	}
-	else {
-		pdf = 0;
-	}
-	return LABEL_REFLECT|LABEL_DIFFUSE;
-}
-
-CCL_NAMESPACE_END
-
-#endif /* __BSDF_WESTIN_H__ */
-
diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/geom/geom_bvh.h
index dd7c25d581d..c5336e086b7 100644
--- a/intern/cycles/kernel/geom/geom_bvh.h
+++ b/intern/cycles/kernel/geom/geom_bvh.h
@@ -28,6 +28,13 @@
 
 CCL_NAMESPACE_BEGIN
 
+/* Don't inline intersect functions on GPU, this is faster */
+#ifdef __KERNEL_GPU__
+#define ccl_device_intersect ccl_device_noinline
+#else
+#define ccl_device_intersect ccl_device_inline
+#endif
+
 /* BVH intersection function variations */
 
 #define BVH_INSTANCING			1
@@ -35,6 +42,8 @@ CCL_NAMESPACE_BEGIN
 #define BVH_HAIR				4
 #define BVH_HAIR_MINIMUM_WIDTH	8
 
+/* Regular BVH traversal */
+
 #define BVH_FUNCTION_NAME bvh_intersect
 #define BVH_FUNCTION_FEATURES 0
 #include "geom_bvh_traversal.h"
@@ -63,6 +72,8 @@ CCL_NAMESPACE_BEGIN
 #include "geom_bvh_traversal.h"
 #endif
 
+/* Subsurface scattering BVH traversal */
+
 #if defined(__SUBSURFACE__)
 #define BVH_FUNCTION_NAME bvh_intersect_subsurface
 #define BVH_FUNCTION_FEATURES 0
@@ -93,47 +104,72 @@ CCL_NAMESPACE_BEGIN
 #include "geom_bvh_subsurface.h"
 #endif
 
+/* Record all BVH intersections for shadows */
+
 #if defined(__SHADOW_RECORD_ALL__)
 #define BVH_FUNCTION_NAME bvh_intersect_shadow_all
 #define BVH_FUNCTION_FEATURES 0
 #include "geom_bvh_shadow.h"
 #endif
 
-#if defined(__SUBSURFACE__) && defined(__INSTANCING__)
+#if defined(__SHADOW_RECORD_ALL__) && defined(__INSTANCING__)
 #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing
 #define BVH_FUNCTION_FEATURES BVH_INSTANCING
 #include "geom_bvh_shadow.h"
 #endif
 
-#if defined(__SUBSURFACE__) && defined(__HAIR__)
+#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__)
 #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
 #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
 #include "geom_bvh_shadow.h"
 #endif
 
-#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__)
+#if defined(__SHADOW_RECORD_ALL__) && defined(__OBJECT_MOTION__)
 #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
 #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
 #include "geom_bvh_shadow.h"
 #endif
 
-#if defined(__SUBSURFACE__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
 #define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
 #define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
 #include "geom_bvh_shadow.h"
 #endif
 
-/* to work around titan bug when using arrays instead of textures */
-#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
-ccl_device_inline
-#else
-ccl_device_noinline
+/* Camera inside Volume BVH intersection */
+
+#if defined(__VOLUME__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume
+#define BVH_FUNCTION_FEATURES 0
+#include "geom_bvh_volume.h"
 #endif
-#ifdef __HAIR__ 
-bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, uint *lcg_state, float difl, float extmax)
-#else
-bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect)
+
+#if defined(__VOLUME__) && defined(__INSTANCING__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__HAIR__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_hair
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+#include "geom_bvh_volume.h"
+#endif
+
+#if defined(__VOLUME__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#define BVH_FUNCTION_NAME bvh_intersect_volume_hair_motion
+#define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
+#include "geom_bvh_volume.h"
 #endif
+
+ccl_device_intersect bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect,
+					 uint *lcg_state, float difl, float extmax)
 {
 #ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
@@ -170,14 +206,8 @@ bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, I
 #endif /* __KERNEL_CPU__ */
 }
 
-/* to work around titan bug when using arrays instead of textures */
 #ifdef __SUBSURFACE__
-#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
-uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
+ccl_device_intersect uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
 {
 #ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
@@ -215,14 +245,8 @@ uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection
 }
 #endif
 
-/* to work around titan bug when using arrays instead of textures */
 #ifdef __SHADOW_RECORD_ALL__
-#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
-uint scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits)
+ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits)
 {
 #ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
@@ -240,20 +264,50 @@ uint scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection
 		return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits);
 #endif /* __HAIR__ */
 
-#ifdef __KERNEL_CPU__
-
 #ifdef __INSTANCING__
 	if(kernel_data.bvh.have_instancing)
 		return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
 #endif /* __INSTANCING__ */
 
 	return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
+}
+#endif
+
+#ifdef __VOLUME__
+ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
+                            const Ray *ray,
+                            Intersection *isect)
+{
+#ifdef __OBJECT_MOTION__
+	if(kernel_data.bvh.have_motion) {
+#ifdef __HAIR__
+		if(kernel_data.bvh.have_curves)
+			return bvh_intersect_volume_hair_motion(kg, ray, isect);
+#endif /* __HAIR__ */
+
+		return bvh_intersect_volume_motion(kg, ray, isect);
+	}
+#endif /* __OBJECT_MOTION__ */
+
+#ifdef __HAIR__
+	if(kernel_data.bvh.have_curves)
+		return bvh_intersect_volume_hair(kg, ray, isect);
+#endif /* __HAIR__ */
+
+#ifdef __KERNEL_CPU__
+
+#ifdef __INSTANCING__
+	if(kernel_data.bvh.have_instancing)
+		return bvh_intersect_volume_instancing(kg, ray, isect);
+#endif /* __INSTANCING__ */
+
+	return bvh_intersect_volume(kg, ray, isect);
 #else /* __KERNEL_CPU__ */
 
 #ifdef __INSTANCING__
-	return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
+	return bvh_intersect_volume_instancing(kg, ray, isect);
 #else
-	return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
+	return bvh_intersect_volume(kg, ray, isect);
 #endif /* __INSTANCING__ */
 
 #endif /* __KERNEL_CPU__ */
diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/geom/geom_bvh_shadow.h
index 98bf82b3b2d..aee4097d77e 100644
--- a/intern/cycles/kernel/geom/geom_bvh_shadow.h
+++ b/intern/cycles/kernel/geom/geom_bvh_shadow.h
@@ -68,15 +68,15 @@ ccl_device bool BVH_FUNCTION_NAME
 	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
 	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
 	
-	const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
-	__m128 Psplat[3], idirsplat[3];
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+	ssef Psplat[3], idirsplat[3];
 	shuffle_swap_t shufflexyz[3];
 
-	Psplat[0] = _mm_set_ps1(P.x);
-	Psplat[1] = _mm_set_ps1(P.y);
-	Psplat[2] = _mm_set_ps1(P.z);
+	Psplat[0] = ssef(P.x);
+	Psplat[1] = ssef(P.y);
+	Psplat[2] = ssef(P.z);
 
-	__m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+	ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
 
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
@@ -132,27 +132,27 @@ ccl_device bool BVH_FUNCTION_NAME
 				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
 
 				/* fetch node data */
-				const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+				const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
 				const float4 cnodes = ((float4*)bvh_nodes)[3];
 
 				/* intersect ray against child nodes */
-				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
-				const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
-				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
+				const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+				const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+				const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
 
 				/* calculate { c0min, c1min, -c0max, -c1max} */
-				__m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat));
-				const __m128 tminmax = _mm_xor_ps(minmax, pn);
-				const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
+				const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+				const ssef tminmax = minmax ^ pn;
+				const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
 
 				/* decide which nodes to traverse next */
 #ifdef __VISIBILITY_FLAG__
 				/* this visibility test gives a 5% performance hit, how to solve? */
-				traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
-				traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
+				traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & PATH_RAY_SHADOW);
+				traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & PATH_RAY_SHADOW);
 #else
-				traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
-				traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
+				traverseChild0 = (movemask(lrhit) & 1);
+				traverseChild1 = (movemask(lrhit) & 2);
 #endif
 #endif // __KERNEL_SSE2__
 
@@ -164,9 +164,7 @@ ccl_device bool BVH_FUNCTION_NAME
 #if !defined(__KERNEL_SSE2__)
 					bool closestChild1 = (c1min < c0min);
 #else
-					union { __m128 m128; float v[4]; } uminmax;
-					uminmax.m128 = tminmax;
-					bool closestChild1 = uminmax.v[1] < uminmax.v[0];
+					bool closestChild1 = tminmax[1] < tminmax[0];
 #endif
 
 					if(closestChild1) {
@@ -254,8 +252,7 @@ ccl_device bool BVH_FUNCTION_NAME
 							if(kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
 #endif
 							{
-								float4 Ns = kernel_tex_fetch(__tri_normal, prim);
-								shader = __float_as_int(Ns.w);
+								shader =  kernel_tex_fetch(__tri_shader, prim);
 							}
 #ifdef __HAIR__
 							else {
@@ -301,12 +298,12 @@ ccl_device bool BVH_FUNCTION_NAME
 					num_hits_in_instance = 0;
 
 #if defined(__KERNEL_SSE2__)
-					Psplat[0] = _mm_set_ps1(P.x);
-					Psplat[1] = _mm_set_ps1(P.y);
-					Psplat[2] = _mm_set_ps1(P.z);
+					Psplat[0] = ssef(P.x);
+					Psplat[1] = ssef(P.y);
+					Psplat[2] = ssef(P.z);
 
 					isect_array->t = isect_t;
-					tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+					tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
 
 					gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
@@ -348,13 +345,13 @@ ccl_device bool BVH_FUNCTION_NAME
 			}
 
 #if defined(__KERNEL_SSE2__)
-			Psplat[0] = _mm_set_ps1(P.x);
-			Psplat[1] = _mm_set_ps1(P.y);
-			Psplat[2] = _mm_set_ps1(P.z);
+			Psplat[0] = ssef(P.x);
+			Psplat[1] = ssef(P.y);
+			Psplat[2] = ssef(P.z);
 
 			isect_t = tmax;
 			isect_array->t = isect_t;
-			tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+			tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
 
 			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
index a19f05dd371..a8f57cffa78 100644
--- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h
+++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
@@ -65,15 +65,15 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
 	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
 	
-	const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
-	__m128 Psplat[3], idirsplat[3];
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+	ssef Psplat[3], idirsplat[3];
 	shuffle_swap_t shufflexyz[3];
 
-	Psplat[0] = _mm_set_ps1(P.x);
-	Psplat[1] = _mm_set_ps1(P.y);
-	Psplat[2] = _mm_set_ps1(P.z);
+	Psplat[0] = ssef(P.x);
+	Psplat[1] = ssef(P.y);
+	Psplat[2] = ssef(P.z);
 
-	__m128 tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+	ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
 
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
@@ -131,25 +131,27 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
 
 				/* fetch node data */
-				const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+				const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
 				const float4 cnodes = ((float4*)bvh_nodes)[3];
 
 				/* intersect ray against child nodes */
-				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
-				const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
-				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
+				const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+				const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+				const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
 
-				const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), pn);
-				const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
+				/* calculate { c0min, c1min, -c0max, -c1max} */
+				const ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+				const ssef tminmax = minmax ^ pn;
+				const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
 
 				/* decide which nodes to traverse next */
 #ifdef __VISIBILITY_FLAG__
 				/* this visibility test gives a 5% performance hit, how to solve? */
-				traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
-				traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
+				traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
+				traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
 #else
-				traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
-				traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
+				traverseChild0 = (movemask(lrhit) & 1);
+				traverseChild1 = (movemask(lrhit) & 2);
 #endif
 #endif // __KERNEL_SSE2__
 
@@ -161,9 +163,7 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 #if !defined(__KERNEL_SSE2__)
 					bool closestChild1 = (c1min < c0min);
 #else
-					union { __m128 m128; float v[4]; } uminmax;
-					uminmax.m128 = tminmax;
-					bool closestChild1 = uminmax.v[1] < uminmax.v[0];
+					bool closestChild1 = tminmax[1] < tminmax[0];
 #endif
 
 					if(closestChild1) {
@@ -243,11 +243,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 #endif
 
 #if defined(__KERNEL_SSE2__)
-						Psplat[0] = _mm_set_ps1(P.x);
-						Psplat[1] = _mm_set_ps1(P.y);
-						Psplat[2] = _mm_set_ps1(P.z);
+						Psplat[0] = ssef(P.x);
+						Psplat[1] = ssef(P.y);
+						Psplat[2] = ssef(P.z);
 
-						tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+						tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
 
 						gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
@@ -279,11 +279,11 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 #endif
 
 #if defined(__KERNEL_SSE2__)
-			Psplat[0] = _mm_set_ps1(P.x);
-			Psplat[1] = _mm_set_ps1(P.y);
-			Psplat[2] = _mm_set_ps1(P.z);
+			Psplat[0] = ssef(P.x);
+			Psplat[1] = ssef(P.y);
+			Psplat[2] = ssef(P.z);
 
-			tsplat = _mm_set_ps(-isect_t, -isect_t, 0.0f, 0.0f);
+			tsplat = ssef(0.0f, 0.0f, -isect_t, -isect_t);
 
 			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h
index 9fd40f91471..114d30a479d 100644
--- a/intern/cycles/kernel/geom/geom_bvh_traversal.h
+++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h
@@ -63,24 +63,28 @@ ccl_device bool BVH_FUNCTION_NAME
 #endif
 
 	isect->t = ray->t;
-	isect->object = OBJECT_NONE;
-	isect->prim = PRIM_NONE;
 	isect->u = 0.0f;
 	isect->v = 0.0f;
+	isect->prim = PRIM_NONE;
+	isect->object = OBJECT_NONE;
+
+#if defined(__KERNEL_DEBUG__)
+	isect->num_traversal_steps = 0;
+#endif
 
 #if defined(__KERNEL_SSE2__)
 	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
 	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
 	
-	const __m128 pn = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0, 0));
-	__m128 Psplat[3], idirsplat[3];
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000));
+	ssef Psplat[3], idirsplat[3];
 	shuffle_swap_t shufflexyz[3];
 
-	Psplat[0] = _mm_set_ps1(P.x);
-	Psplat[1] = _mm_set_ps1(P.y);
-	Psplat[2] = _mm_set_ps1(P.z);
+	Psplat[0] = ssef(P.x);
+	Psplat[1] = ssef(P.y);
+	Psplat[2] = ssef(P.z);
 
-	__m128 tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
+	ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t);
 
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
@@ -151,17 +155,17 @@ ccl_device bool BVH_FUNCTION_NAME
 				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
 
 				/* fetch node data */
-				const __m128 *bvh_nodes = (__m128*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+				const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
 				const float4 cnodes = ((float4*)bvh_nodes)[3];
 
 				/* intersect ray against child nodes */
-				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
-				const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
-				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
+				const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+				const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+				const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
 
 				/* calculate { c0min, c1min, -c0max, -c1max} */
-				__m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat));
-				const __m128 tminmax = _mm_xor_ps(minmax, pn);
+				ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+				const ssef tminmax = minmax ^ pn;
 
 #if FEATURE(BVH_HAIR_MINIMUM_WIDTH)
 				if(difl != 0.0f) {
@@ -182,16 +186,16 @@ ccl_device bool BVH_FUNCTION_NAME
 				}
 #endif
 
-				const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
+				const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax);
 
 				/* decide which nodes to traverse next */
 #ifdef __VISIBILITY_FLAG__
 				/* this visibility test gives a 5% performance hit, how to solve? */
-				traverseChild0 = (_mm_movemask_ps(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
-				traverseChild1 = (_mm_movemask_ps(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
+				traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
+				traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
 #else
-				traverseChild0 = (_mm_movemask_ps(lrhit) & 1);
-				traverseChild1 = (_mm_movemask_ps(lrhit) & 2);
+				traverseChild0 = (movemask(lrhit) & 1);
+				traverseChild1 = (movemask(lrhit) & 2);
 #endif
 #endif // __KERNEL_SSE2__
 
@@ -203,9 +207,7 @@ ccl_device bool BVH_FUNCTION_NAME
 #if !defined(__KERNEL_SSE2__)
 					bool closestChild1 = (c1min < c0min);
 #else
-					union { __m128 m128; float v[4]; } uminmax;
-					uminmax.m128 = tminmax;
-					bool closestChild1 = uminmax.v[1] < uminmax.v[0];
+					bool closestChild1 = tminmax[1] < tminmax[0];
 #endif
 
 					if(closestChild1) {
@@ -228,6 +230,10 @@ ccl_device bool BVH_FUNCTION_NAME
 						--stackPtr;
 					}
 				}
+
+#if defined(__KERNEL_DEBUG__)
+				isect->num_traversal_steps++;
+#endif
 			}
 
 			/* if node is leaf, fetch triangle list */
@@ -276,13 +282,17 @@ ccl_device bool BVH_FUNCTION_NAME
 							}
 						}
 
+#if defined(__KERNEL_DEBUG__)
+						isect->num_traversal_steps++;
+#endif
+
 						/* shadow ray early termination */
 #if defined(__KERNEL_SSE2__)
 						if(hit) {
 							if(visibility == PATH_RAY_SHADOW_OPAQUE)
 								return true;
 
-							tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
+							tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
 						}
 #else
 						if(hit && visibility == PATH_RAY_SHADOW_OPAQUE)
@@ -304,11 +314,11 @@ ccl_device bool BVH_FUNCTION_NAME
 #endif
 
 #if defined(__KERNEL_SSE2__)
-					Psplat[0] = _mm_set_ps1(P.x);
-					Psplat[1] = _mm_set_ps1(P.y);
-					Psplat[2] = _mm_set_ps1(P.z);
+					Psplat[0] = ssef(P.x);
+					Psplat[1] = ssef(P.y);
+					Psplat[2] = ssef(P.z);
 
-					tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
+					tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
 
 					gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
@@ -334,11 +344,11 @@ ccl_device bool BVH_FUNCTION_NAME
 #endif
 
 #if defined(__KERNEL_SSE2__)
-			Psplat[0] = _mm_set_ps1(P.x);
-			Psplat[1] = _mm_set_ps1(P.y);
-			Psplat[2] = _mm_set_ps1(P.z);
+			Psplat[0] = ssef(P.x);
+			Psplat[1] = ssef(P.y);
+			Psplat[2] = ssef(P.z);
 
-			tsplat = _mm_set_ps(-isect->t, -isect->t, 0.0f, 0.0f);
+			tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
 
 			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
diff --git a/intern/cycles/kernel/geom/geom_bvh_volume.h b/intern/cycles/kernel/geom/geom_bvh_volume.h
new file mode 100644
index 00000000000..9dd8d226f5b
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_bvh_volume.h
@@ -0,0 +1,322 @@
+/*
+ * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
+ * and code copyright 2009-2012 Intel Corporation
+ *
+ * Modifications Copyright 2011-2014, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This is a template BVH traversal function for volumes, where
+ * various features can be enabled/disabled. This way we can compile optimized
+ * versions for each case without new features slowing things down.
+ * BVH_FUNCTION_NAME and BVH_FUNCTION_FEATURES must be defined before inclusion.
+ * BVH_INSTANCING: object instancing
+ * BVH_HAIR: hair curve rendering
+ * BVH_MOTION: motion blur rendering
+ * Only objects flagged SD_OBJECT_HAS_VOLUME are tested; returns true if isect->prim is no longer PRIM_NONE.
+ */
+
+#define FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
+
+ccl_device bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+                                  const Ray *ray,
+                                  Intersection *isect)
+{
+	/* todo:
+	 * - test if pushing distance on the stack helps (for non shadow rays)
+	 * - separate version for shadow rays
+	 * - likely and unlikely for if() statements
+	 * - test restrict attribute for pointers
+	 */
+
+	/* traversal stack in CUDA thread-local memory */
+	int traversalStack[BVH_STACK_SIZE];
+	traversalStack[0] = ENTRYPOINT_SENTINEL; /* bottom marker: popping it ends the traversal */
+
+	/* traversal variables in registers */
+	int stackPtr = 0;
+	int nodeAddr = kernel_data.bvh.root;
+
+	/* ray parameters in registers */
+	float3 P = ray->P;
+	float3 dir = bvh_clamp_direction(ray->D);
+	float3 idir = bvh_inverse_direction(dir);
+	int object = OBJECT_NONE; /* stays OBJECT_NONE while outside any instance */
+
+	const uint visibility = PATH_RAY_ALL_VISIBILITY; /* NOTE(review): presumably matches every ray type, i.e. no filtering -- confirm */
+
+#if FEATURE(BVH_MOTION)
+	Transform ob_tfm;
+#endif
+
+	isect->t = ray->t; /* farthest distance accepted so far, starts at ray->t */
+	isect->u = 0.0f;
+	isect->v = 0.0f;
+	isect->prim = PRIM_NONE; /* "no hit" marker, tested by the final return */
+	isect->object = OBJECT_NONE;
+
+#if defined(__KERNEL_SSE2__)
+	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
+	const shuffle_swap_t shuf_swap = shuffle_swap_swap();
+	
+	const ssef pn = cast(ssei(0, 0, 0x80000000, 0x80000000)); /* sign-bit mask, XORed below to un-negate the max values */
+	ssef Psplat[3], idirsplat[3];
+	shuffle_swap_t shufflexyz[3];
+
+	Psplat[0] = ssef(P.x);
+	Psplat[1] = ssef(P.y);
+	Psplat[2] = ssef(P.z);
+
+	ssef tsplat(0.0f, 0.0f, -isect->t, -isect->t); /* clamps: 0 for the min values, -t for the negated max values */
+
+	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+	/* traversal loop */
+	do {
+		do {
+			/* traverse internal nodes */
+			while(nodeAddr >= 0 && nodeAddr != ENTRYPOINT_SENTINEL) {
+				bool traverseChild0, traverseChild1;
+				int nodeAddrChild1;
+
+#if !defined(__KERNEL_SSE2__)
+				/* Intersect two child bounding boxes, non-SSE version */
+				float t = isect->t;
+
+				/* fetch node data */
+				float4 node0 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+0);
+				float4 node1 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+1);
+				float4 node2 = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+2);
+				float4 cnodes = kernel_tex_fetch(__bvh_nodes, nodeAddr*BVH_NODE_SIZE+3);
+
+				/* intersect ray against child nodes */
+				NO_EXTENDED_PRECISION float c0lox = (node0.x - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0hix = (node0.z - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c0loy = (node1.x - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0hiy = (node1.z - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c0loz = (node2.x - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0hiz = (node2.z - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+
+				NO_EXTENDED_PRECISION float c1lox = (node0.y - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1hix = (node0.w - P.x) * idir.x;
+				NO_EXTENDED_PRECISION float c1loy = (node1.y - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1hiy = (node1.w - P.y) * idir.y;
+				NO_EXTENDED_PRECISION float c1loz = (node2.y - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1hiz = (node2.w - P.z) * idir.z;
+				NO_EXTENDED_PRECISION float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
+				NO_EXTENDED_PRECISION float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+
+				/* decide which nodes to traverse next */
+#ifdef __VISIBILITY_FLAG__
+				/* this visibility test gives a 5% performance hit, how to solve? */
+				traverseChild0 = (c0max >= c0min) && (__float_as_uint(cnodes.z) & visibility);
+				traverseChild1 = (c1max >= c1min) && (__float_as_uint(cnodes.w) & visibility);
+#else
+				traverseChild0 = (c0max >= c0min);
+				traverseChild1 = (c1max >= c1min);
+#endif
+
+#else // __KERNEL_SSE2__
+				/* Intersect two child bounding boxes, SSE3 version adapted from Embree */
+
+				/* fetch node data */
+				const ssef *bvh_nodes = (ssef*)kg->__bvh_nodes.data + nodeAddr*BVH_NODE_SIZE;
+				const float4 cnodes = ((float4*)bvh_nodes)[3];
+
+				/* intersect ray against child nodes */
+				const ssef tminmaxx = (shuffle_swap(bvh_nodes[0], shufflexyz[0]) - Psplat[0]) * idirsplat[0];
+				const ssef tminmaxy = (shuffle_swap(bvh_nodes[1], shufflexyz[1]) - Psplat[1]) * idirsplat[1];
+				const ssef tminmaxz = (shuffle_swap(bvh_nodes[2], shufflexyz[2]) - Psplat[2]) * idirsplat[2];
+
+				/* calculate { c0min, c1min, -c0max, -c1max} */
+				ssef minmax = max(max(tminmaxx, tminmaxy), max(tminmaxz, tsplat));
+				const ssef tminmax = minmax ^ pn;
+
+				const sseb lrhit = tminmax <= shuffle<2, 3, 0, 1>(tminmax); /* per child: min <= max */
+
+				/* decide which nodes to traverse next */
+#ifdef __VISIBILITY_FLAG__
+				/* this visibility test gives a 5% performance hit, how to solve? */
+				traverseChild0 = (movemask(lrhit) & 1) && (__float_as_uint(cnodes.z) & visibility);
+				traverseChild1 = (movemask(lrhit) & 2) && (__float_as_uint(cnodes.w) & visibility);
+#else
+				traverseChild0 = (movemask(lrhit) & 1);
+				traverseChild1 = (movemask(lrhit) & 2);
+#endif
+#endif // __KERNEL_SSE2__
+
+				nodeAddr = __float_as_int(cnodes.x); /* address of child 0 */
+				nodeAddrChild1 = __float_as_int(cnodes.y); /* address of child 1 */
+
+				if(traverseChild0 && traverseChild1) {
+					/* both children were intersected, push the farther one */
+#if !defined(__KERNEL_SSE2__)
+					bool closestChild1 = (c1min < c0min);
+#else
+					bool closestChild1 = tminmax[1] < tminmax[0];
+#endif
+
+					if(closestChild1) {
+						int tmp = nodeAddr;
+						nodeAddr = nodeAddrChild1;
+						nodeAddrChild1 = tmp;
+					}
+
+					++stackPtr;
+					traversalStack[stackPtr] = nodeAddrChild1;
+				}
+				else {
+					/* one child was intersected */
+					if(traverseChild1) {
+						nodeAddr = nodeAddrChild1;
+					}
+					else if(!traverseChild0) {
+						/* neither child was intersected */
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+
+			/* if node is leaf, fetch triangle list */
+			if(nodeAddr < 0) {
+				float4 leaf = kernel_tex_fetch(__bvh_nodes, (-nodeAddr-1)*BVH_NODE_SIZE+(BVH_NODE_SIZE-1)); /* leaf index is stored as -nodeAddr-1 */
+				int primAddr = __float_as_int(leaf.x); /* first primitive; negative values denote an instance */
+
+#if FEATURE(BVH_INSTANCING)
+				if(primAddr >= 0) {
+#endif
+					int primAddr2 = __float_as_int(leaf.y); /* one past the last primitive of the leaf */
+
+					/* pop */
+					nodeAddr = traversalStack[stackPtr];
+					--stackPtr;
+
+					/* primitive intersection */
+					for(; primAddr < primAddr2; primAddr++) {
+						/* only primitives from objects flagged as having a volume; inside an instance the instance object decides */
+						uint tri_object = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, primAddr): object;
+						int object_flag = kernel_tex_fetch(__object_flag, tri_object);
+
+						if((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+							continue;
+						}
+
+						/* intersect ray against primitive */
+						uint type = kernel_tex_fetch(__prim_type, primAddr);
+
+						switch(type & PRIMITIVE_ALL) {
+							case PRIMITIVE_TRIANGLE: {
+								triangle_intersect(kg, isect, P, dir, visibility, object, primAddr); /* NOTE(review): return value unused; presumably updates isect on a closer hit -- confirm */
+								break;
+							}
+#if FEATURE(BVH_MOTION)
+							case PRIMITIVE_MOTION_TRIANGLE: {
+								motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr);
+								break;
+							}
+#endif
+#if FEATURE(BVH_HAIR)
+							case PRIMITIVE_CURVE:
+							case PRIMITIVE_MOTION_CURVE: {
+								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) 
+									bvh_cardinal_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								else
+									bvh_curve_intersect(kg, isect, P, dir, visibility, object, primAddr, ray->time, type, NULL, 0, 0);
+								break;
+							}
+#endif
+							default: {
+								break;
+							}
+						}
+					}
+				}
+#if FEATURE(BVH_INSTANCING)
+				else {
+					/* instance push */
+					object = kernel_tex_fetch(__prim_object, -primAddr-1);
+					int object_flag = kernel_tex_fetch(__object_flag, object);
+
+					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
+#if FEATURE(BVH_MOTION)
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
+#else
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+#endif
+
+#if defined(__KERNEL_SSE2__)
+						Psplat[0] = ssef(P.x);
+						Psplat[1] = ssef(P.y);
+						Psplat[2] = ssef(P.z);
+
+						tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+
+						gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+						++stackPtr;
+						traversalStack[stackPtr] = ENTRYPOINT_SENTINEL; /* popping this ends the inner loop so the instance can be popped */
+
+						nodeAddr = kernel_tex_fetch(__object_node, object);
+					}
+					else {
+						/* pop */
+						nodeAddr = traversalStack[stackPtr];
+						--stackPtr;
+					}
+				}
+			}
+#endif
+		} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+#if FEATURE(BVH_INSTANCING)
+		if(stackPtr >= 0) { /* the sentinel came from an instance push, not from traversalStack[0] */
+			kernel_assert(object != OBJECT_NONE);
+
+			/* instance pop */
+#if FEATURE(BVH_MOTION)
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_tfm);
+#else
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+#endif
+
+#if defined(__KERNEL_SSE2__)
+			Psplat[0] = ssef(P.x);
+			Psplat[1] = ssef(P.y);
+			Psplat[2] = ssef(P.z);
+
+			tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
+
+			gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
+#endif
+
+			object = OBJECT_NONE; /* no longer inside an instance */
+			nodeAddr = traversalStack[stackPtr];
+			--stackPtr;
+		}
+#endif
+	} while(nodeAddr != ENTRYPOINT_SENTINEL);
+
+	return (isect->prim != PRIM_NONE);
+}
+
+#undef FEATURE
+#undef BVH_FUNCTION_NAME
+#undef BVH_FUNCTION_FEATURES
+
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index e1d225436a6..b6d21c91916 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -214,9 +214,9 @@ ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta,
 }
 
 #ifdef __KERNEL_SSE2__
-ccl_device_inline __m128 transform_point_T3(const __m128 t[3], const __m128 &a)
+ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)
 {
-	return fma(broadcast<0>(a), t[0], fma(broadcast<1>(a), t[1], _mm_mul_ps(broadcast<2>(a), t[2])));
+	return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2])); /* NOTE(review): presumably a.x*t[0] + a.y*t[1] + a.z*t[2], assuming madd(x, y, z) = x*y + z -- confirm */
 }
 #endif
 
@@ -238,16 +238,16 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 	int prim = kernel_tex_fetch(__prim_index, curveAddr);
 
 #ifdef __KERNEL_SSE2__
-	__m128 vdir = load_m128(dir);
-	__m128 vcurve_coef[4];
+	ssef vdir = load4f(dir);
+	ssef vcurve_coef[4];
 	const float3 *curve_coef = (float3 *)vcurve_coef;
 	
 	{
-		__m128 dtmp = _mm_mul_ps(vdir, vdir);
-		__m128 d_ss = _mm_sqrt_ss(_mm_add_ss(dtmp, broadcast<2>(dtmp)));
-		__m128 rd_ss = _mm_div_ss(_mm_set_ss(1.0f), d_ss);
+		ssef dtmp = vdir * vdir;
+		ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp));
+		ssef rd_ss = load1f_first(1.0f) / d_ss;
 
-		__m128i v00vec = _mm_load_si128((__m128i *)&kg->__curves.data[prim]);
+		ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]);
 		int2 &v00 = (int2 &)v00vec;
 
 		int k0 = v00.x + segment;
@@ -255,44 +255,44 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 		int ka = max(k0 - 1, v00.x);
 		int kb = min(k1 + 1, v00.x + v00.y - 1);
 
-		__m128 P_curve[4];
+		ssef P_curve[4];
 
 		if(type & PRIMITIVE_CURVE) {
-			P_curve[0] = _mm_load_ps(&kg->__curve_keys.data[ka].x);
-			P_curve[1] = _mm_load_ps(&kg->__curve_keys.data[k0].x);
-			P_curve[2] = _mm_load_ps(&kg->__curve_keys.data[k1].x);
-			P_curve[3] = _mm_load_ps(&kg->__curve_keys.data[kb].x);
+			P_curve[0] = load4f(&kg->__curve_keys.data[ka].x);
+			P_curve[1] = load4f(&kg->__curve_keys.data[k0].x);
+			P_curve[2] = load4f(&kg->__curve_keys.data[k1].x);
+			P_curve[3] = load4f(&kg->__curve_keys.data[kb].x);
 		}
 		else {
 			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
 			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve);
 		}
 
-		__m128 rd_sgn = set_sign_bit<0, 1, 1, 1>(broadcast<0>(rd_ss));
-		__m128 mul_zxxy = _mm_mul_ps(shuffle<2, 0, 0, 1>(vdir), rd_sgn);
-		__m128 mul_yz = _mm_mul_ps(shuffle<1, 2, 1, 2>(vdir), mul_zxxy);
-		__m128 mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
-		__m128 vdir0 = _mm_and_ps(vdir, _mm_castsi128_ps(_mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0)));
+		ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss));
+		ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn;
+		ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy;
+		ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
+		ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
 
-		__m128 htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
-		__m128 htfm1 = shuffle<1, 0, 1, 3>(_mm_set_ss(_mm_cvtss_f32(d_ss)), vdir0);
-		__m128 htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
+		ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
+		ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0);
+		ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
 
-		__m128 htfm[] = { htfm0, htfm1, htfm2 };
-		__m128 vP = load_m128(P);
-		__m128 p0 = transform_point_T3(htfm, _mm_sub_ps(P_curve[0], vP));
-		__m128 p1 = transform_point_T3(htfm, _mm_sub_ps(P_curve[1], vP));
-		__m128 p2 = transform_point_T3(htfm, _mm_sub_ps(P_curve[2], vP));
-		__m128 p3 = transform_point_T3(htfm, _mm_sub_ps(P_curve[3], vP));
+		ssef htfm[] = { htfm0, htfm1, htfm2 };
+		ssef vP = load4f(P);
+		ssef p0 = transform_point_T3(htfm, P_curve[0] - vP);
+		ssef p1 = transform_point_T3(htfm, P_curve[1] - vP);
+		ssef p2 = transform_point_T3(htfm, P_curve[2] - vP);
+		ssef p3 = transform_point_T3(htfm, P_curve[3] - vP);
 
 		float fc = 0.71f;
-		__m128 vfc = _mm_set1_ps(fc);
-		__m128 vfcxp3 = _mm_mul_ps(vfc, p3);
+		ssef vfc = ssef(fc);
+		ssef vfcxp3 = vfc * p3;
 
 		vcurve_coef[0] = p1;
-		vcurve_coef[1] = _mm_mul_ps(vfc, _mm_sub_ps(p2, p0));
-		vcurve_coef[2] = fma(_mm_set1_ps(fc * 2.0f), p0, fma(_mm_set1_ps(fc - 3.0f), p1, fms(_mm_set1_ps(3.0f - 2.0f * fc), p2, vfcxp3)));
-		vcurve_coef[3] = fms(_mm_set1_ps(fc - 2.0f), _mm_sub_ps(p2, p1), fms(vfc, p0, vfcxp3));
+		vcurve_coef[1] = vfc * (p2 - p0);
+		vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3)));
+		vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3));
 
 		r_st = ((float4 &)P_curve[1]).w;
 		r_en = ((float4 &)P_curve[2]).w;
@@ -386,12 +386,12 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 		float i_st = tree * resol;
 		float i_en = i_st + (level * resol);
 #ifdef __KERNEL_SSE2__
-		__m128 vi_st = _mm_set1_ps(i_st), vi_en = _mm_set1_ps(i_en);
-		__m128 vp_st = fma(fma(fma(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
-		__m128 vp_en = fma(fma(fma(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]);
+		ssef vi_st = ssef(i_st), vi_en = ssef(i_en);
+		ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
+		ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]);
 
-		__m128 vbmin = _mm_min_ps(vp_st, vp_en);
-		__m128 vbmax = _mm_max_ps(vp_st, vp_en);
+		ssef vbmin = min(vp_st, vp_en);
+		ssef vbmax = max(vp_st, vp_en);
 
 		float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax;
 		float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z;
@@ -600,13 +600,12 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 #endif
 			{
 				/* record intersection */
+				isect->t = t;
+				isect->u = u;
+				isect->v = gd;
 				isect->prim = curveAddr;
 				isect->object = object;
 				isect->type = type;
-				isect->u = u;
-				isect->v = gd;
-				/*isect->transparency = 1.0f - coverage; */
-				isect->t = t;
 				hit = true;
 			}
 			
@@ -679,38 +678,38 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
 	float sphere_b_tmp = dot3(dir, sphere_dif1);
 	float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
 #else
-	__m128 P_curve[2];
+	ssef P_curve[2];
 	
 	if(type & PRIMITIVE_CURVE) {
-		P_curve[0] = _mm_load_ps(&kg->__curve_keys.data[k0].x);
-		P_curve[1] = _mm_load_ps(&kg->__curve_keys.data[k1].x);
+		P_curve[0] = load4f(&kg->__curve_keys.data[k0].x);
+		P_curve[1] = load4f(&kg->__curve_keys.data[k1].x);
 	}
 	else {
 		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
 		motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve);
 	}
 
-	const __m128 or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]);
+	const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]);
 
-	__m128 r12 = or12;
-	const __m128 vP = load_m128(P);
-	const __m128 dif = _mm_sub_ps(vP, P_curve[0]);
-	const __m128 dif_second = _mm_sub_ps(vP, P_curve[1]);
+	ssef r12 = or12;
+	const ssef vP = load4f(P);
+	const ssef dif = vP - P_curve[0];
+	const ssef dif_second = vP - P_curve[1];
 	if(difl != 0.0f) {
-		const __m128 len1_sq = len3_squared_splat(dif);
-		const __m128 len2_sq = len3_squared_splat(dif_second);
-		const __m128 len12 = _mm_sqrt_ps(shuffle<0, 0, 0, 0>(len1_sq, len2_sq));
-		const __m128 pixelsize12 = _mm_min_ps(_mm_mul_ps(len12, _mm_set1_ps(difl)), _mm_set1_ps(extmax));
-		r12 = _mm_max_ps(or12, pixelsize12);
+		const ssef len1_sq = len3_squared_splat(dif);
+		const ssef len2_sq = len3_squared_splat(dif_second);
+		const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq));
+		const ssef pixelsize12 = min(len12 * difl, ssef(extmax));
+		r12 = max(or12, pixelsize12);
 	}
-	float or1 = _mm_cvtss_f32(or12), or2 = _mm_cvtss_f32(broadcast<2>(or12));
-	float r1 = _mm_cvtss_f32(r12), r2 = _mm_cvtss_f32(broadcast<2>(r12));
-
-	const __m128 p21_diff = _mm_sub_ps(P_curve[1], P_curve[0]);
-	const __m128 sphere_dif1 = _mm_mul_ps(_mm_add_ps(dif, dif_second), _mm_set1_ps(0.5f));
-	const __m128 dir = load_m128(direction);
-	const __m128 sphere_b_tmp = dot3_splat(dir, sphere_dif1);
-	const __m128 sphere_dif2 = fnma(sphere_b_tmp, dir, sphere_dif1);
+	float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12));
+	float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12));
+
+	const ssef p21_diff = P_curve[1] - P_curve[0];
+	const ssef sphere_dif1 = (dif + dif_second) * 0.5f;
+	const ssef dir = load4f(direction);
+	const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1);
+	const ssef sphere_dif2 = nmsub(sphere_b_tmp, dir, sphere_dif1);
 #endif
 
 	float mr = max(r1, r2);
@@ -728,7 +727,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
 #ifndef __KERNEL_SSE2__
 	float3 tg = p21_diff * invl;
 #else
-	const __m128 tg = _mm_mul_ps(p21_diff, _mm_set1_ps(invl));
+	const ssef tg = p21_diff * invl;
 #endif
 	float gd = (r2 - r1) * invl;
 
@@ -752,7 +751,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
 	float3 cprod = cross(tg, dir);
 	float cprod2sq = len3_squared(cross(tg, dif));
 #else
-	const __m128 cprod = cross(tg, dir);
+	const ssef cprod = cross(tg, dir);
 	float cprod2sq = len3_squared(cross_zxy(tg, dif));
 #endif
 	float cprodsq = len3_squared(cprod);
@@ -770,7 +769,7 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
 #ifndef __KERNEL_SSE2__
 	float3 tdif = dif + tcentre * dir;
 #else
-	const __m128 tdif = fma(_mm_set1_ps(tcentre), dir, dif);
+	const ssef tdif = madd(ssef(tcentre), dir, dif);
 #endif
 	float tdifz = dot3(tdif, tg);
 	float tdifma = tdifz*gd + r1;
@@ -836,13 +835,12 @@ ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isec
 #endif
 			{
 				/* record intersection */
+				isect->t = t;
+				isect->u = z*invl;
+				isect->v = gd;
 				isect->prim = curveAddr;
 				isect->object = object;
 				isect->type = type;
-				isect->u = z*invl;
-				isect->v = gd;
-				/*isect->transparency = 1.0f - adjradius;*/
-				isect->t = t;
 
 				return true;
 			}
@@ -938,9 +936,10 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 		sd->u = isect->u;
 		sd->v = 0.0f;
 #endif
-	
+
+		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
+
 		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
-			tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
 			sd->Ng = normalize(-(D - tg * (dot(tg, D))));
 		}
 		else {
@@ -952,7 +951,6 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 			float gd = isect->v;
 
 			if(gd != 0.0f) {
-				tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
 				sd->Ng = sd->Ng - gd * tg;
 				sd->Ng = normalize(sd->Ng);
 			}
@@ -1012,10 +1010,6 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 	sd->dPdv = cross(tg, sd->Ng);
 #endif
 
-	/*add fading parameter for minimum pixel width with transparency bsdf*/
-	/*sd->curve_transparency = isect->transparency;*/
-	/*sd->curve_radius = sd->u * gd * l + r1;*/
-
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
 		Transform tfm = sd->ob_tfm;
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h
index 73338bb6b3b..3a4b20e61aa 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -233,8 +233,7 @@ ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, Sh
 ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool subsurface)
 {
 	/* get shader */
-	float4 Ns = kernel_tex_fetch(__tri_normal, sd->prim);
-	sd->shader = __float_as_int(Ns.w);
+	sd->shader =  kernel_tex_fetch(__tri_shader, sd->prim);
 
 	/* get motion info */
 	int numsteps, numverts;
@@ -273,7 +272,11 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD
 #endif
 
 	/* compute face normal */
-	float3 Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
+	float3 Ng;
+	if(sd->flag & SD_NEGATIVE_SCALE_APPLIED)
+		Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0]));
+	else
+		Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
 
 	sd->Ng = Ng;
 	sd->N = Ng;
@@ -327,14 +330,21 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection
 	float t, u, v;
 
 	if(ray_triangle_intersect_uv(P, dir, isect->t, verts[2], verts[0], verts[1], &u, &v, &t)) {
-		isect->prim = triAddr;
-		isect->object = object;
-		isect->type = PRIMITIVE_MOTION_TRIANGLE;
-		isect->u = u;
-		isect->v = v;
-		isect->t = t;
+#ifdef __VISIBILITY_FLAG__
+		/* visibility flag test. we do it here under the assumption
+		 * that most triangles are culled by node flags */
+		if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
+#endif
+		{
+			isect->t = t;
+			isect->u = u;
+			isect->v = v;
+			isect->prim = triAddr;
+			isect->object = object;
+			isect->type = PRIMITIVE_MOTION_TRIANGLE;
 		
-		return true;
+			return true;
+		}
 	}
 
 	return false;
@@ -378,12 +388,12 @@ ccl_device_inline void motion_triangle_intersect_subsurface(KernelGlobals *kg, I
 
 		/* record intersection */
 		Intersection *isect = &isect_array[hit];
+		isect->t = t;
+		isect->u = u;
+		isect->v = v;
 		isect->prim = triAddr;
 		isect->object = object;
 		isect->type = PRIMITIVE_MOTION_TRIANGLE;
-		isect->u = u;
-		isect->v = v;
-		isect->t = t;
 	}
 }
 #endif
diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index 533973621d7..5df6c75df86 100644
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -143,6 +143,7 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 	/* center position */
 	float3 center;
 
+#ifdef __HAIR__
 	if(sd->type & PRIMITIVE_ALL_CURVE) {
 		center = curve_motion_center_location(kg, sd);
 
@@ -150,6 +151,7 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
 			object_position_transform(kg, sd, &center);
 	}
 	else
+#endif
 		center = sd->P;
 
 	float3 motion_pre = center, motion_post = center;
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index 355e36fef0c..c08a82ee038 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -18,7 +18,7 @@
 /* Triangle Primitive
  *
  * Basic triangle with 3 vertices is used to represent mesh surfaces. For BVH
- * ray intersection we use a precomputed triangle storage to accelarate
+ * ray intersection we use a precomputed triangle storage to accelerate
  * intersection at the cost of more memory usage */
 
 CCL_NAMESPACE_BEGIN
@@ -116,11 +116,28 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, ShaderDat
 #endif
 }
 
+/* normal on triangle  */
+ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
+{
+	/* load triangle vertices */
+	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+
+	float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
+	float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
+	float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
+	
+	/* return normal */
+	if(sd->flag & SD_NEGATIVE_SCALE_APPLIED)
+		return normalize(cross(v2 - v0, v1 - v0));
+	else
+		return normalize(cross(v1 - v0, v2 - v0));
+}
+
 /* point and normal on triangle  */
-ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int prim, float u, float v, float3 *P, float3 *Ng, int *shader)
+ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int object, int prim, float u, float v, float3 *P, float3 *Ng, int *shader)
 {
 	/* load triangle vertices */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
 
 	float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
 	float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
@@ -130,16 +147,24 @@ ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int prim, float
 	float t = 1.0f - u - v;
 	*P = (u*v0 + v*v1 + t*v2);
 
-	float4 Nm = kernel_tex_fetch(__tri_normal, prim);
-	*Ng = make_float3(Nm.x, Nm.y, Nm.z);
-	*shader = __float_as_int(Nm.w);
+	/* get object flags, instance-aware */
+	int object_flag = kernel_tex_fetch(__object_flag, object >= 0 ? object : ~object);
+
+	/* compute normal */
+	if(object_flag & SD_NEGATIVE_SCALE_APPLIED)
+		*Ng = normalize(cross(v2 - v0, v1 - v0));
+	else
+		*Ng = normalize(cross(v1 - v0, v2 - v0));
+
+	/* shader */
+	*shader = kernel_tex_fetch(__tri_shader, prim);
 }
 
 /* Triangle vertex locations */
 
 ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3])
 {
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
 
 	P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
 	P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
@@ -151,7 +176,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
 ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v)
 {
 	/* load triangle vertices */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
 
 	float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.x)));
 	float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, __float_as_int(tri_vindex.y)));
@@ -165,7 +190,7 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo
 ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, float3 *dPdu, float3 *dPdv)
 {
 	/* fetch triangle vertex coordinates */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+	float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
 
 	float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
 	float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
@@ -187,7 +212,7 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s
 		return kernel_tex_fetch(__attributes_float, offset + sd->prim);
 	}
 	else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
-		float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+		float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 
 		float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x));
 		float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y));
@@ -230,7 +255,7 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 		return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
 	}
 	else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
-		float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+		float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 
 		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x)));
 		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y)));
@@ -243,11 +268,20 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 
 		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
 	}
-	else if(elem == ATTR_ELEMENT_CORNER) {
+	else if(elem == ATTR_ELEMENT_CORNER || elem == ATTR_ELEMENT_CORNER_BYTE) {
 		int tri = offset + sd->prim*3;
-		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0));
-		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1));
-		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2));
+		float3 f0, f1, f2;
+
+		if(elem == ATTR_ELEMENT_CORNER) {
+			f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0));
+			f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1));
+			f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2));
+		}
+		else {
+			f0 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, tri + 0));
+			f1 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, tri + 1));
+			f2 = color_byte_to_float(kernel_tex_fetch(__attributes_uchar4, tri + 2));
+		}
 
 #ifdef __RAY_DIFFERENTIALS__
 		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
@@ -300,12 +334,12 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg, Intersection *isect
 #endif
 				{
 					/* record intersection */
+					isect->t = t;
+					isect->u = u;
+					isect->v = v;
 					isect->prim = triAddr;
 					isect->object = object;
 					isect->type = PRIMITIVE_TRIANGLE;
-					isect->u = u;
-					isect->v = v;
-					isect->t = t;
 					return true;
 				}
 			}
@@ -363,12 +397,12 @@ ccl_device_inline void triangle_intersect_subsurface(KernelGlobals *kg, Intersec
 
 				/* record intersection */
 				Intersection *isect = &isect_array[hit];
+				isect->t = t;
+				isect->u = u;
+				isect->v = v;
 				isect->prim = triAddr;
 				isect->object = object;
 				isect->type = PRIMITIVE_TRIANGLE;
-				isect->u = u;
-				isect->v = v;
-				isect->t = t;
 			}
 		}
 	}
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 963d6cbee9c..3cb6d168f80 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -49,7 +49,15 @@ ccl_device float3 volume_normalized_position(KernelGlobals *kg, const ShaderData
 ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int id, float *dx, float *dy)
 {
 	float3 P = volume_normalized_position(kg, sd, sd->P);
-	float4 r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
+#ifdef __KERNEL_GPU__
+	float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+#else
+	float4 r;
+	if(sd->flag & SD_VOLUME_CUBIC)
+		r = kernel_tex_image_interp_3d_ex(id, P.x, P.y, P.z, INTERPOLATION_CUBIC);
+	else
+		r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
+#endif
 
 	if(dx) *dx = 0.0f;
 	if(dx) *dy = 0.0f;
@@ -61,7 +69,15 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int id, float3 *dx, float3 *dy)
 {
 	float3 P = volume_normalized_position(kg, sd, sd->P);
-	float4 r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
+#ifdef __KERNEL_GPU__
+	float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+#else
+	float4 r;
+	if(sd->flag & SD_VOLUME_CUBIC)
+		r = kernel_tex_image_interp_3d_ex(id, P.x, P.y, P.z, INTERPOLATION_CUBIC);
+	else
+		r = kernel_tex_image_interp_3d(id, P.x, P.y, P.z);
+#endif
 
 	if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
 	if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/kernel.cl b/intern/cycles/kernel/kernel.cl
index 6988ad6027f..4f20ef9ca15 100644
--- a/intern/cycles/kernel/kernel.cl
+++ b/intern/cycles/kernel/kernel.cl
@@ -23,7 +23,7 @@
 
 #include "kernel_film.h"
 #include "kernel_path.h"
-#include "kernel_displace.h"
+#include "kernel_bake.h"
 
 __kernel void kernel_ocl_path_trace(
 	ccl_constant KernelData *data,
@@ -115,7 +115,7 @@ __kernel void kernel_ocl_shader(
 	ccl_global type *name,
 #include "kernel_textures.h"
 
-	int type, int sx, int sw)
+	int type, int sx, int sw, int offset, int sample)
 {
 	KernelGlobals kglobals, *kg = &kglobals;
 
@@ -128,6 +128,31 @@ __kernel void kernel_ocl_shader(
 	int x = sx + get_global_id(0);
 
 	if(x < sx + sw)
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x);
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
+}
+
+__kernel void kernel_ocl_bake(
+	ccl_constant KernelData *data,
+	ccl_global uint4 *input,
+	ccl_global float4 *output,
+
+#define KERNEL_TEX(type, ttype, name) \
+	ccl_global type *name,
+#include "kernel_textures.h"
+
+	int type, int sx, int sw, int offset, int sample)
+{
+	KernelGlobals kglobals, *kg = &kglobals;
+
+	kg->data = data;
+
+#define KERNEL_TEX(type, ttype, name) \
+	kg->name = name;
+#include "kernel_textures.h"
+
+	int x = sx + get_global_id(0);
+
+	if(x < sx + sw)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample);
 }
 
diff --git a/intern/cycles/kernel/kernel.cpp b/intern/cycles/kernel/kernel.cpp
index 173028d50c8..fa2113fbb46 100644
--- a/intern/cycles/kernel/kernel.cpp
+++ b/intern/cycles/kernel/kernel.cpp
@@ -23,7 +23,7 @@
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
-#include "kernel_displace.h"
+#include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -120,9 +120,12 @@ void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *bu
 
 /* Shader Evaluation */
 
-void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
 {
-	kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+	if(type >= SHADER_EVAL_BAKE)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+	else
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel.cu b/intern/cycles/kernel/kernel.cu
index 636e48b5456..489daacddde 100644
--- a/intern/cycles/kernel/kernel.cu
+++ b/intern/cycles/kernel/kernel.cu
@@ -22,7 +22,7 @@
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
-#include "kernel_displace.h"
+#include "kernel_bake.h"
 
 /* device data taken from CUDA occupancy calculator */
 
@@ -52,8 +52,20 @@
 #define CUDA_KERNEL_MAX_REGISTERS 63
 #define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
 
-/* 5.0 */
-#elif __CUDA_ARCH__ == 500
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+#define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+#define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#define CUDA_BLOCK_MAX_THREADS 1024
+#define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#define CUDA_THREADS_BLOCK_WIDTH 16
+#define CUDA_KERNEL_MAX_REGISTERS 63
+#define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 5.0 and 5.2 */
+#elif __CUDA_ARCH__ == 500 || __CUDA_ARCH__ == 520
 #define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
 #define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
 #define CUDA_BLOCK_MAX_THREADS 1024
@@ -61,12 +73,12 @@
 
 /* tunable parameters */
 #define CUDA_THREADS_BLOCK_WIDTH 16
-#define CUDA_KERNEL_MAX_REGISTERS 63
+#define CUDA_KERNEL_MAX_REGISTERS 40
 #define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
 
 /* unknown architecture */
 #else
-#error "Unknown or unuspported CUDA architecture, can't determine launch bounds"
+#error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
 #endif
 
 /* compute number of threads per block and minimum blocks per multiprocessor
@@ -146,11 +158,22 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx)
+kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample)
+{
+	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+
+	if(x < sx + sw)
+		kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x, sample);
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_bake(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
-	kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x);
+	if(x < sx + sw)
+		kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, x, offset, sample);
 }
 
 #endif
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index c4a08646bab..19e06b88797 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -41,7 +41,7 @@ void kernel_cpu_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i);
+	int type, int i, int offset, int sample);
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
@@ -51,7 +51,7 @@ void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buf
 void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i);
+	int type, int i, int offset, int sample);
 #endif
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
@@ -62,7 +62,7 @@ void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buf
 void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i);
+	int type, int i, int offset, int sample);
 #endif
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
@@ -73,7 +73,7 @@ void kernel_cpu_sse41_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *bu
 void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i);
+	int type, int i, int offset, int sample);
 #endif
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
@@ -84,7 +84,18 @@ void kernel_cpu_avx_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buff
 void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i);
+	int type, int i, int offset, int sample);
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
+	int sample, int x, int y, int offset, int stride);
+void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+	float sample_scale, int x, int y, int offset, int stride);
+void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+	float sample_scale, int x, int y, int offset, int stride);
+void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
+	int type, int i, int offset, int sample);
 #endif
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index b4f6dcdace9..b0efcdc66a7 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -32,10 +32,11 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v
 		eval->transmission = make_float3(0.0f, 0.0f, 0.0f);
 		eval->transparent = make_float3(0.0f, 0.0f, 0.0f);
 		eval->subsurface = make_float3(0.0f, 0.0f, 0.0f);
+		eval->scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 		if(type == CLOSURE_BSDF_TRANSPARENT_ID)
 			eval->transparent = value;
-		else if(CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_PHASE(type))
+		else if(CLOSURE_IS_BSDF_DIFFUSE(type))
 			eval->diffuse = value;
 		else if(CLOSURE_IS_BSDF_GLOSSY(type))
 			eval->glossy = value;
@@ -43,6 +44,8 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v
 			eval->transmission = value;
 		else if(CLOSURE_IS_BSDF_BSSRDF(type))
 			eval->subsurface = value;
+		else if(CLOSURE_IS_PHASE(type))
+			eval->scatter = value;
 	}
 	else
 		eval->diffuse = value;
@@ -51,11 +54,17 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v
 #endif
 }
 
-ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
+/* TODO(sergey): This is just a workaround for annoying 6.5 compiler bug. */
+#if !defined(__KERNEL_CUDA__) || __CUDA_ARCH__ < 500
+ccl_device_inline
+#else
+ccl_device_noinline
+#endif
+void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
 {
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
-		if(CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_PHASE(type))
+		if(CLOSURE_IS_BSDF_DIFFUSE(type))
 			eval->diffuse += value;
 		else if(CLOSURE_IS_BSDF_GLOSSY(type))
 			eval->glossy += value;
@@ -63,6 +72,8 @@ ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3
 			eval->transmission += value;
 		else if(CLOSURE_IS_BSDF_BSSRDF(type))
 			eval->subsurface += value;
+		else if(CLOSURE_IS_PHASE(type))
+			eval->scatter += value;
 
 		/* skipping transparent, this function is used by for eval(), will be zero then */
 	}
@@ -81,7 +92,8 @@ ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
 			&& is_zero(eval->glossy)
 			&& is_zero(eval->transmission)
 			&& is_zero(eval->transparent)
-			&& is_zero(eval->subsurface);
+			&& is_zero(eval->subsurface)
+			&& is_zero(eval->scatter);
 	}
 	else
 		return is_zero(eval->diffuse);
@@ -98,6 +110,7 @@ ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float3 value)
 		eval->glossy *= value;
 		eval->transmission *= value;
 		eval->subsurface *= value;
+		eval->scatter *= value;
 
 		/* skipping transparent, this function is used by for eval(), will be zero then */
 	}
@@ -111,7 +124,7 @@ ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float3 value)
 /* Path Radiance
  *
  * We accumulate different render passes separately. After summing at the end
- * to get the combined result, it should be identical. We definte directly
+ * to get the combined result, it should be identical. We define directly
  * visible as the first non-transparent hit, while indirectly visible are the
  * bounces after that. */
 
@@ -130,21 +143,25 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
 		L->color_glossy = make_float3(0.0f, 0.0f, 0.0f);
 		L->color_transmission = make_float3(0.0f, 0.0f, 0.0f);
 		L->color_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+		L->color_scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 		L->direct_diffuse = make_float3(0.0f, 0.0f, 0.0f);
 		L->direct_glossy = make_float3(0.0f, 0.0f, 0.0f);
 		L->direct_transmission = make_float3(0.0f, 0.0f, 0.0f);
 		L->direct_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+		L->direct_scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 		L->indirect_diffuse = make_float3(0.0f, 0.0f, 0.0f);
 		L->indirect_glossy = make_float3(0.0f, 0.0f, 0.0f);
 		L->indirect_transmission = make_float3(0.0f, 0.0f, 0.0f);
 		L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+		L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 		L->path_diffuse = make_float3(0.0f, 0.0f, 0.0f);
 		L->path_glossy = make_float3(0.0f, 0.0f, 0.0f);
 		L->path_transmission = make_float3(0.0f, 0.0f, 0.0f);
 		L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+		L->path_scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 		L->emission = make_float3(0.0f, 0.0f, 0.0f);
 		L->background = make_float3(0.0f, 0.0f, 0.0f);
@@ -174,14 +191,16 @@ ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, float3 *throug
 			L->path_glossy = bsdf_eval->glossy*value;
 			L->path_transmission = bsdf_eval->transmission*value;
 			L->path_subsurface = bsdf_eval->subsurface*value;
+			L->path_scatter = bsdf_eval->scatter*value;
 
-			*throughput = L->path_diffuse + L->path_glossy + L->path_transmission + L->path_subsurface;
+			*throughput = L->path_diffuse + L->path_glossy + L->path_transmission + L->path_subsurface + L->path_scatter;
 			
 			L->direct_throughput = *throughput;
 		}
 		else {
 			/* transparent bounce before first hit, or indirectly visible through BSDF */
-			float3 sum = (bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->transparent + bsdf_eval->subsurface)*inverse_pdf;
+			float3 sum = (bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->transparent +
+						  bsdf_eval->subsurface + bsdf_eval->scatter) * inverse_pdf;
 			*throughput *= sum;
 		}
 	}
@@ -241,6 +260,7 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
 			L->direct_glossy += throughput*bsdf_eval->glossy*shadow;
 			L->direct_transmission += throughput*bsdf_eval->transmission*shadow;
 			L->direct_subsurface += throughput*bsdf_eval->subsurface*shadow;
+			L->direct_scatter += throughput*bsdf_eval->scatter*shadow;
 
 			if(is_lamp) {
 				L->shadow.x += shadow.x*shadow_fac;
@@ -250,7 +270,7 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
 		}
 		else {
 			/* indirectly visible lighting after BSDF bounce */
-			float3 sum = bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->subsurface;
+			float3 sum = bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->subsurface + bsdf_eval->scatter;
 			L->indirect += throughput*sum*shadow;
 		}
 	}
@@ -291,12 +311,14 @@ ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
 		L->direct_glossy += L->path_glossy*L->direct_emission;
 		L->direct_transmission += L->path_transmission*L->direct_emission;
 		L->direct_subsurface += L->path_subsurface*L->direct_emission;
+		L->direct_scatter += L->path_scatter*L->direct_emission;
 
 		L->indirect = safe_divide_color(L->indirect, L->direct_throughput);
 		L->indirect_diffuse += L->path_diffuse*L->indirect;
 		L->indirect_glossy += L->path_glossy*L->indirect;
 		L->indirect_transmission += L->path_transmission*L->indirect;
 		L->indirect_subsurface += L->path_subsurface*L->indirect;
+		L->indirect_scatter += L->path_scatter*L->indirect;
 	}
 #endif
 }
@@ -309,6 +331,7 @@ ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L)
 		L->path_glossy = make_float3(0.0f, 0.0f, 0.0f);
 		L->path_transmission = make_float3(0.0f, 0.0f, 0.0f);
 		L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+		L->path_scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 		L->direct_emission = make_float3(0.0f, 0.0f, 0.0f);
 		L->indirect = make_float3(0.0f, 0.0f, 0.0f);
@@ -327,8 +350,8 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 	if(L->use_light_pass) {
 		path_radiance_sum_indirect(L);
 
-		L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission + L->direct_subsurface + L->emission;
-		L_indirect = L->indirect_diffuse + L->indirect_glossy + L->indirect_transmission + L->indirect_subsurface;
+		L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission + L->direct_subsurface + L->direct_scatter + L->emission;
+		L_indirect = L->indirect_diffuse + L->indirect_glossy + L->indirect_transmission + L->indirect_subsurface + L->indirect_scatter;
 
 		if(!kernel_data.background.transparent)
 			L_direct += L->background;
@@ -344,11 +367,13 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 			L->direct_glossy = make_float3(0.0f, 0.0f, 0.0f);
 			L->direct_transmission = make_float3(0.0f, 0.0f, 0.0f);
 			L->direct_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+			L->direct_scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 			L->indirect_diffuse = make_float3(0.0f, 0.0f, 0.0f);
 			L->indirect_glossy = make_float3(0.0f, 0.0f, 0.0f);
 			L->indirect_transmission = make_float3(0.0f, 0.0f, 0.0f);
 			L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+			L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 			L->emission = make_float3(0.0f, 0.0f, 0.0f);
 		}
@@ -368,6 +393,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 				L->direct_glossy *= scale;
 				L->direct_transmission *= scale;
 				L->direct_subsurface *= scale;
+				L->direct_scatter *= scale;
 				L->emission *= scale;
 				L->background *= scale;
 			}
@@ -382,6 +408,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 				L->indirect_glossy *= scale;
 				L->indirect_transmission *= scale;
 				L->indirect_subsurface *= scale;
+				L->indirect_scatter *= scale;
 			}
 
 			/* Sum again, after clamping */
@@ -416,11 +443,13 @@ ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance
 	L->direct_glossy += L_sample->direct_glossy*fac;
 	L->direct_transmission += L_sample->direct_transmission*fac;
 	L->direct_subsurface += L_sample->direct_subsurface*fac;
+	L->direct_scatter += L_sample->direct_scatter*fac;
 
 	L->indirect_diffuse += L_sample->indirect_diffuse*fac;
 	L->indirect_glossy += L_sample->indirect_glossy*fac;
 	L->indirect_transmission += L_sample->indirect_transmission*fac;
 	L->indirect_subsurface += L_sample->indirect_subsurface*fac;
+	L->indirect_scatter += L_sample->indirect_scatter*fac;
 
 	L->emission += L_sample->emission*fac;
 	L->background += L_sample->background*fac;
diff --git a/intern/cycles/kernel/kernel_avx.cpp b/intern/cycles/kernel/kernel_avx.cpp
index 354214c406e..e7ff21a6f09 100644
--- a/intern/cycles/kernel/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernel_avx.cpp
@@ -24,6 +24,7 @@
 #define __KERNEL_SSE3__
 #define __KERNEL_SSSE3__
 #define __KERNEL_SSE41__
+#define __KERNEL_AVX__
 #endif
  
 #include "util_optimization.h"
@@ -37,7 +38,7 @@
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
-#include "kernel_displace.h"
+#include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -67,9 +68,12 @@ void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float
 
 /* Shader Evaluate */
 
-void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
 {
-	kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+	if(type >= SHADER_EVAL_BAKE)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+	else
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_avx2.cpp b/intern/cycles/kernel/kernel_avx2.cpp
new file mode 100644
index 00000000000..cb1662bbfbe
--- /dev/null
+++ b/intern/cycles/kernel/kernel_avx2.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+ 
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#define __KERNEL_SSE2__
+#define __KERNEL_SSE3__
+#define __KERNEL_SSSE3__
+#define __KERNEL_SSE41__
+#define __KERNEL_AVX__
+#define __KERNEL_AVX2__
+#endif
+ 
+#include "util_optimization.h"
+ 
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+
+#include "kernel.h"
+#include "kernel_compat_cpu.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_film.h"
+#include "kernel_path.h"
+#include "kernel_bake.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Path Tracing */
+
+void kernel_cpu_avx2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state, int sample, int x, int y, int offset, int stride)
+{
+#ifdef __BRANCHED_PATH__
+	if(kernel_data.integrator.branched)
+		kernel_branched_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+	else
+#endif
+		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+}
+
+/* Film */
+
+void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
+{
+	kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+/* Shader Evaluate */
+
+void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
+{
+	if(type >= SHADER_EVAL_BAKE)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+	else
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+}
+
+CCL_NAMESPACE_END
+#else
+
+/* needed for some linkers in combination with scons making empty compilation unit in a library */
+void __dummy_function_cycles_avx2(void);
+void __dummy_function_cycles_avx2(void) {}
+
+#endif
diff --git a/intern/cycles/kernel/kernel_displace.h b/intern/cycles/kernel/kernel_bake.h
index b8c64af658f..a1ec080e3d3 100644
--- a/intern/cycles/kernel/kernel_displace.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -17,65 +17,125 @@
 CCL_NAMESPACE_BEGIN
 
 ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, RNG rng,
-                                   bool is_combined, bool is_ao, bool is_sss)
+                                   const bool is_combined, const bool is_ao, const bool is_sss, int sample)
 {
-	int samples = kernel_data.integrator.aa_samples;
-
 	/* initialize master radiance accumulator */
 	kernel_assert(kernel_data.film.use_light_pass);
 	path_radiance_init(L, kernel_data.film.use_light_pass);
 
-	/* take multiple samples */
-	for(int sample = 0; sample < samples; sample++) {
-		PathRadiance L_sample;
-		PathState state;
-		Ray ray;
-		float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+	PathRadiance L_sample;
+	PathState state;
+	Ray ray;
+	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+	bool is_sss_sample = is_sss;
 
-		/* init radiance */
-		path_radiance_init(&L_sample, kernel_data.film.use_light_pass);
+	/* init radiance */
+	path_radiance_init(&L_sample, kernel_data.film.use_light_pass);
 
-		/* init path state */
-		path_state_init(kg, &state, &rng, sample);
-		state.num_samples = samples;
+	/* init path state */
+	path_state_init(kg, &state, &rng, sample, NULL);
 
-		/* evaluate surface shader */
-		float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF);
-		shader_eval_surface(kg, sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
+	/* evaluate surface shader */
+	float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF);
+	shader_eval_surface(kg, sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
 
-		/* TODO, disable the closures we won't need */
+	/* TODO, disable the closures we won't need */
+
+#ifdef __BRANCHED_PATH__
+	if(!kernel_data.integrator.branched) {
+		/* regular path tracer */
+#endif
 
 		/* sample ambient occlusion */
 		if(is_combined || is_ao) {
 			kernel_path_ao(kg, sd, &L_sample, &state, &rng, throughput);
 		}
 
-		/* sample subsurface scattering */
-		if((is_combined || is_sss) && (sd->flag & SD_BSSRDF)) {
 #ifdef __SUBSURFACE__
+		/* sample subsurface scattering */
+		if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
 			/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
 			if (kernel_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, &throughput))
-				is_sss = true;
-#endif
+				is_sss_sample = true;
 		}
+#endif
 
 		/* sample light and BSDF */
-		if((!is_sss) && (!is_ao)) {
-			if(kernel_path_integrate_lighting(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) {
+		if((!is_sss_sample) && (!is_ao)) {
+
+			if(sd->flag & SD_EMISSION) {
+				float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
+				path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
+			}
+
+			kernel_path_surface_connect_light(kg, &rng, sd, throughput, &state, &L_sample);
+
+			if(kernel_path_surface_bounce(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) {
 #ifdef __LAMP_MIS__
 				state.ray_t = 0.0f;
 #endif
 				/* compute indirect light */
-				kernel_path_indirect(kg, &rng, ray, throughput, state.num_samples, state, &L_sample);
+				kernel_path_indirect(kg, &rng, ray, throughput, 1, state, &L_sample);
 
 				/* sum and reset indirect light pass variables for the next samples */
 				path_radiance_sum_indirect(&L_sample);
 				path_radiance_reset_indirect(&L_sample);
 			}
 		}
+#ifdef __BRANCHED_PATH__
+	}
+	else {
+		/* branched path tracer */
+
+		/* sample ambient occlusion */
+		if(is_combined || is_ao) {
+			kernel_branched_path_ao(kg, sd, &L_sample, &state, &rng, throughput);
+		}
+
+#ifdef __SUBSURFACE__
+		/* sample subsurface scattering */
+		if((is_combined || is_sss_sample) && (sd->flag & SD_BSSRDF)) {
+			/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
+			kernel_branched_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, throughput);
+		}
+#endif
+
+		/* sample light and BSDF */
+		if((!is_sss_sample) && (!is_ao)) {
+
+			if(sd->flag & SD_EMISSION) {
+				float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
+				path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
+			}
+
+#if defined(__EMISSION__)
+			/* direct light */
+			if(kernel_data.integrator.use_direct_light) {
+				bool all = kernel_data.integrator.sample_all_lights_direct;
+				kernel_branched_path_surface_connect_light(kg, &rng,
+					sd, &state, throughput, 1.0f, &L_sample, all);
+			}
+#endif
+
+			/* indirect light */
+			kernel_branched_path_surface_indirect_light(kg, &rng,
+				sd, throughput, 1.0f, &state, &L_sample);
+		}
+	}
+#endif
+
+	/* accumulate into master L */
+	path_radiance_accum_sample(L, &L_sample, 1);
+}
 
-		/* accumulate into master L */
-		path_radiance_accum_sample(L, &L_sample, samples);
+ccl_device bool is_aa_pass(ShaderEvalType type)
+{
+	switch(type) {
+		case SHADER_EVAL_UV:
+		case SHADER_EVAL_NORMAL:
+			return false;
+		default:
+			return true;
 	}
 }
 
@@ -99,7 +159,21 @@ ccl_device bool is_light_pass(ShaderEvalType type)
 	}
 }
 
-ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i)
+#if 0
+ccl_device_inline float bake_clamp_mirror_repeat(float u)
+{
+	/* use mirror repeat (like opengl texture) so that if the barycentric
+	 * coordinate goes past the end of the triangle it is not always clamped
+	 * to the same value, gives ugly patterns */
+	float fu = floorf(u);
+	u = u - fu;
+
+	return (((int)fu) & 1)? 1.0f - u: u;
+}
+#endif
+
+ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output,
+                                     ShaderEvalType type, int i, int offset, int sample)
 {
 	ShaderData sd;
 	uint4 in = input[i * 2];
@@ -121,10 +195,28 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	float dvdx = __uint_as_float(diff.z);
 	float dvdy = __uint_as_float(diff.w);
 
+	int num_samples = kernel_data.integrator.aa_samples;
+
+	/* random number generator */
+	RNG rng = cmj_hash(offset + i, 0);
+
+#if 0
+	uint rng_state = cmj_hash(i, 0);
+	float filter_x, filter_y;
+	path_rng_init(kg, &rng_state, sample, num_samples, &rng, 0, 0, &filter_x, &filter_y);
+
+	/* subpixel u/v offset */
+	if(sample > 0) {
+		u = bake_clamp_mirror_repeat(u + dudx*(filter_x - 0.5f) + dudy*(filter_y - 0.5f));
+		v = bake_clamp_mirror_repeat(v + dvdx*(filter_x - 0.5f) + dvdy*(filter_y - 0.5f));
+	}
+#endif
+
+	/* triangle */
 	int shader;
 	float3 P, Ng;
 
-	triangle_point_normal(kg, prim, u, v, &P, &Ng, &shader);
+	triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
 
 	/* dummy initilizations copied from SHADER_EVAL_DISPLACE */
 	float3 I = Ng;
@@ -147,12 +239,14 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	sd.dv.dx = dvdx;
 	sd.dv.dy = dvdy;
 
+	/* light passes */
 	if(is_light_pass(type)) {
-		RNG rng = cmj_hash(i, 0);
-		compute_light_pass(kg, &sd, &L, rng, (type == SHADER_EVAL_COMBINED),
-		                                     (type == SHADER_EVAL_AO),
-		                                     (type == SHADER_EVAL_SUBSURFACE_DIRECT ||
-		                                      type == SHADER_EVAL_SUBSURFACE_INDIRECT));
+		compute_light_pass(kg, &sd, &L, rng,
+		                   (type == SHADER_EVAL_COMBINED),
+		                   (type == SHADER_EVAL_AO),
+		                   (type == SHADER_EVAL_SUBSURFACE_DIRECT ||
+		                    type == SHADER_EVAL_SUBSURFACE_INDIRECT),
+		                   sample);
 	}
 
 	switch (type) {
@@ -307,17 +401,16 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	}
 
 	/* write output */
-	output[i] = make_float4(out.x, out.y, out.z, 1.0f);
-	return;
+	float output_fac = is_aa_pass(type)? 1.0f/num_samples: 1.0f;
+
+	if(sample == 0)
+		output[i] = make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
+	else
+		output[i] += make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
 }
 
-ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i)
+ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i, int sample)
 {
-	if(type >= SHADER_EVAL_BAKE) {
-		kernel_bake_evaluate(kg, input, output, type, i);
-		return;
-	}
-
 	ShaderData sd;
 	uint4 in = input[i];
 	float3 out;
@@ -363,7 +456,10 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *inpu
 	}
 	
 	/* write output */
-	output[i] = make_float4(out.x, out.y, out.z, 0.0f);
+	if(sample == 0)
+		output[i] = make_float4(out.x, out.y, out.z, 0.0f);
+	else
+		output[i] += make_float4(out.x, out.y, out.z, 0.0f);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index 7fc66a9fdee..5c83358a56d 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -21,16 +21,22 @@ CCL_NAMESPACE_BEGIN
 ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v)
 {
 	float blades = kernel_data.cam.blades;
+	float2 bokeh;
 
 	if(blades == 0.0f) {
 		/* sample disk */
-		return concentric_sample_disk(u, v);
+		bokeh = concentric_sample_disk(u, v);
 	}
 	else {
 		/* sample polygon */
 		float rotation = kernel_data.cam.bladesrotation;
-		return regular_polygon_sample(blades, rotation, u, v);
+		bokeh = regular_polygon_sample(blades, rotation, u, v);
 	}
+
+	/* anamorphic lens bokeh */
+	bokeh.x *= kernel_data.cam.inv_aperture_ratio;
+
+	return bokeh;
 }
 
 ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
@@ -183,7 +189,8 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float
 
 		/* calculate orthonormal coordinates perpendicular to D */
 		float3 U, V;
-		make_orthonormals(D, &U, &V);
+		U = normalize(make_float3(1.0f, 0.0f, 0.0f) -  D.x * D);
+		V = normalize(cross(D, U));
 
 		/* update ray for effect of lens */
 		ray->P = U * lensuv.x + V * lensuv.y;
@@ -262,6 +269,20 @@ ccl_device_inline float camera_distance(KernelGlobals *kg, float3 P)
 		return len(P - camP);
 }
 
+ccl_device_inline float3 camera_direction_from_point(KernelGlobals *kg, float3 P)
+{
+	Transform cameratoworld = kernel_data.cam.cameratoworld;
+
+	if(kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) {
+		float3 camD = make_float3(cameratoworld.x.z, cameratoworld.y.z, cameratoworld.z.z);
+		return -camD;
+	}
+	else {
+		float3 camP = make_float3(cameratoworld.x.w, cameratoworld.y.w, cameratoworld.z.w);
+		return normalize(camP - P);
+	}
+}
+
 ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, float3 P)
 {
 	if(kernel_data.cam.type != CAMERA_PANORAMA) {
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index d027bb62ebe..37cba03ff97 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -25,6 +25,13 @@
 #include "util_half.h"
 #include "util_types.h"
 
+/* On 64-bit Linux the single precision exponent is really slow compared to the
+ * double precision version, even with float<->double conversion involved.
+ */
+#if !defined(__KERNEL_GPU__) && defined(__linux__) && defined(__x86_64__)
+#  define expf(x) ((float)exp((double)(x)))
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 /* Assertions inside the kernel only work for the CPU device, so we wrap it in
@@ -44,16 +51,16 @@ template<typename T> struct texture  {
 	}
 
 #if 0
-	ccl_always_inline __m128 fetch_m128(int index)
+	ccl_always_inline ssef fetch_ssef(int index)
 	{
 		kernel_assert(index >= 0 && index < width);
-		return ((__m128*)data)[index];
+		return ((ssef*)data)[index];
 	}
 
-	ccl_always_inline __m128i fetch_m128i(int index)
+	ccl_always_inline ssei fetch_ssei(int index)
 	{
 		kernel_assert(index >= 0 && index < width);
-		return ((__m128i*)data)[index];
+		return ((ssei*)data)[index];
 	}
 #endif
 
@@ -144,6 +151,13 @@ template<typename T> struct texture_image  {
 
 	ccl_always_inline float4 interp_3d(float x, float y, float z, bool periodic = false)
 	{
+		return interp_3d_ex(x, y, z, interpolation, periodic);
+	}
+
+	ccl_always_inline float4 interp_3d_ex(float x, float y, float z,
+	                                      int interpolation = INTERPOLATION_LINEAR,
+	                                      bool periodic = false)
+	{
 		if(UNLIKELY(!data))
 			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 
@@ -167,7 +181,7 @@ template<typename T> struct texture_image  {
 
 			return read(data[ix + iy*width + iz*width*height]);
 		}
-		else {
+		else if(interpolation == INTERPOLATION_LINEAR) {
 			float tx = frac(x*(float)width - 0.5f, &ix);
 			float ty = frac(y*(float)height - 0.5f, &iy);
 			float tz = frac(z*(float)depth - 0.5f, &iz);
@@ -205,6 +219,93 @@ template<typename T> struct texture_image  {
 
 			return r;
 		}
+		else {
+			/* Tricubic b-spline interpolation. */
+			const float tx = frac(x*(float)width - 0.5f, &ix);
+			const float ty = frac(y*(float)height - 0.5f, &iy);
+			const float tz = frac(z*(float)depth - 0.5f, &iz);
+			int pix, piy, piz, nnix, nniy, nniz;
+
+			if(periodic) {
+				ix = wrap_periodic(ix, width);
+				iy = wrap_periodic(iy, height);
+				iz = wrap_periodic(iz, depth);
+
+				pix = wrap_periodic(ix-1, width);
+				piy = wrap_periodic(iy-1, height);
+				piz = wrap_periodic(iz-1, depth);
+
+				nix = wrap_periodic(ix+1, width);
+				niy = wrap_periodic(iy+1, height);
+				niz = wrap_periodic(iz+1, depth);
+
+				nnix = wrap_periodic(ix+2, width);
+				nniy = wrap_periodic(iy+2, height);
+				nniz = wrap_periodic(iz+2, depth);
+			}
+			else {
+				ix = wrap_clamp(ix, width);
+				iy = wrap_clamp(iy, height);
+				iz = wrap_clamp(iz, depth);
+
+				pix = wrap_clamp(ix-1, width);
+				piy = wrap_clamp(iy-1, height);
+				piz = wrap_clamp(iz-1, depth);
+
+				nix = wrap_clamp(ix+1, width);
+				niy = wrap_clamp(iy+1, height);
+				niz = wrap_clamp(iz+1, depth);
+
+				nnix = wrap_clamp(ix+2, width);
+				nniy = wrap_clamp(iy+2, height);
+				nniz = wrap_clamp(iz+2, depth);
+			}
+
+			const int xc[4] = {pix, ix, nix, nnix};
+			const int yc[4] = {width * piy,
+			                   width * iy,
+			                   width * niy,
+			                   width * nniy};
+			const int zc[4] = {width * height * piz,
+			                   width * height * iz,
+			                   width * height * niz,
+			                   width * height * nniz};
+			float u[4], v[4], w[4];
+
+			/* Helper macros to keep the code a reasonable size and
+			 * let the compiler inline all the matrix multiplications.
+			 */
+#define SET_SPLINE_WEIGHTS(u, t) \
+			{ \
+				u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
+				u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
+				u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
+				u[3] = (1.0f / 6.0f) * t * t * t; \
+			} (void)0
+#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
+#define COL_TERM(col, row) \
+			(v[col] * (u[0] * DATA(0, col, row) + \
+			           u[1] * DATA(1, col, row) + \
+			           u[2] * DATA(2, col, row) + \
+			           u[3] * DATA(3, col, row)))
+#define ROW_TERM(row) \
+			(w[row] * (COL_TERM(0, row) + \
+			           COL_TERM(1, row) + \
+			           COL_TERM(2, row) + \
+			           COL_TERM(3, row)))
+
+			SET_SPLINE_WEIGHTS(u, tx);
+			SET_SPLINE_WEIGHTS(v, ty);
+			SET_SPLINE_WEIGHTS(w, tz);
+
+			/* Actual interpolation. */
+			return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+#undef COL_TERM
+#undef ROW_TERM
+#undef DATA
+#undef SET_SPLINE_WEIGHTS
+		}
 	}
 
 	ccl_always_inline void dimensions_set(int width_, int height_, int depth_)
@@ -232,11 +333,12 @@ typedef texture_image<uchar4> texture_image_uchar4;
 /* Macros to handle different memory storage on different devices */
 
 #define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
-#define kernel_tex_fetch_m128(tex, index) (kg->tex.fetch_m128(index))
-#define kernel_tex_fetch_m128i(tex, index) (kg->tex.fetch_m128i(index))
+#define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
+#define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
 #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
 #define kernel_tex_image_interp(tex, x, y) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp(x, y) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp(x, y))
 #define kernel_tex_image_interp_3d(tex, x, y, z) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d(x, y, z) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d(x, y, z))
+#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d_ex(x, y, z, interpolation) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d_ex(x, y, z, interpolation))
 
 #define kernel_data (kg->__data)
 
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index e4c20d26ff1..f14f3262274 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -75,12 +75,11 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
 
 /* Use fast math functions */
 
-#define cosf(x) __cosf(((float)x))
-#define sinf(x) __sinf(((float)x))
-#define powf(x, y) __powf(((float)x), ((float)y))
-#define tanf(x) __tanf(((float)x))
-#define logf(x) __logf(((float)x))
-#define expf(x) __expf(((float)x))
+#define cosf(x) __cosf(((float)(x)))
+#define sinf(x) __sinf(((float)(x)))
+#define powf(x, y) __powf(((float)(x)), ((float)(y)))
+#define tanf(x) __tanf(((float)(x)))
+#define logf(x) __logf(((float)(x)))
+#define expf(x) __expf(((float)(x)))
 
 #endif /* __KERNEL_COMPAT_CUDA_H__ */
-
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index 8346b09619e..58031a41b78 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -24,14 +24,6 @@
 #define CCL_NAMESPACE_BEGIN
 #define CCL_NAMESPACE_END
 
-#ifdef __KERNEL_OPENCL_AMD__
-#define __CL_NO_FLOAT3__
-#endif
-
-#ifdef __CL_NO_FLOAT3__
-#define float3 float4
-#endif
-
 #ifdef __CL_NOINLINE__
 #define ccl_noinline __attribute__((noinline))
 #else
@@ -68,51 +60,51 @@
 #ifdef make_int4
 #undef make_int4
 #endif
+#ifdef make_uchar4
+#undef make_uchar4
+#endif
 
 #define make_float2(x, y) ((float2)(x, y))
-#ifdef __CL_NO_FLOAT3__
-#define make_float3(x, y, z) ((float4)(x, y, z, 0.0f))
-#else
 #define make_float3(x, y, z) ((float3)(x, y, z))
-#endif
 #define make_float4(x, y, z, w) ((float4)(x, y, z, w))
 #define make_int2(x, y) ((int2)(x, y))
 #define make_int3(x, y, z) ((int3)(x, y, z))
 #define make_int4(x, y, z, w) ((int4)(x, y, z, w))
+#define make_uchar4(x, y, z, w) ((uchar4)(x, y, z, w))
 
 /* math functions */
 #define __uint_as_float(x) as_float(x)
 #define __float_as_uint(x) as_uint(x)
 #define __int_as_float(x) as_float(x)
 #define __float_as_int(x) as_int(x)
-#define powf(x, y) pow(((float)x), ((float)y))
-#define fabsf(x) fabs(((float)x))
-#define copysignf(x, y) copysign(((float)x), ((float)y))
-#define asinf(x) asin(((float)x))
-#define acosf(x) acos(((float)x))
-#define atanf(x) atan(((float)x))
-#define floorf(x) floor(((float)x))
-#define ceilf(x) ceil(((float)x))
-#define hypotf(x, y) hypot(((float)x), ((float)y))
-#define atan2f(x, y) atan2(((float)x), ((float)y))
-#define fmaxf(x, y) fmax(((float)x), ((float)y))
-#define fminf(x, y) fmin(((float)x), ((float)y))
-#define fmodf(x, y) fmod((float)x, (float)y)
+#define powf(x, y) pow(((float)(x)), ((float)(y)))
+#define fabsf(x) fabs(((float)(x)))
+#define copysignf(x, y) copysign(((float)(x)), ((float)(y)))
+#define asinf(x) asin(((float)(x)))
+#define acosf(x) acos(((float)(x)))
+#define atanf(x) atan(((float)(x)))
+#define floorf(x) floor(((float)(x)))
+#define ceilf(x) ceil(((float)(x)))
+#define hypotf(x, y) hypot(((float)(x)), ((float)(y)))
+#define atan2f(x, y) atan2(((float)(x)), ((float)(y)))
+#define fmaxf(x, y) fmax(((float)(x)), ((float)(y)))
+#define fminf(x, y) fmin(((float)(x)), ((float)(y)))
+#define fmodf(x, y) fmod((float)(x), (float)(y))
 
 #ifndef __CL_USE_NATIVE__
-#define sinf(x) native_sin(((float)x))
-#define cosf(x) native_cos(((float)x))
-#define tanf(x) native_tan(((float)x))
-#define expf(x) native_exp(((float)x))
-#define sqrtf(x) native_sqrt(((float)x))
-#define logf(x) native_log(((float)x))
+#define sinf(x) native_sin(((float)(x)))
+#define cosf(x) native_cos(((float)(x)))
+#define tanf(x) native_tan(((float)(x)))
+#define expf(x) native_exp(((float)(x)))
+#define sqrtf(x) native_sqrt(((float)(x)))
+#define logf(x) native_log(((float)(x)))
 #else
-#define sinf(x) sin(((float)x))
-#define cosf(x) cos(((float)x))
-#define tanf(x) tan(((float)x))
-#define expf(x) exp(((float)x))
-#define sqrtf(x) sqrt(((float)x))
-#define logf(x) log(((float)x))
+#define sinf(x) sin(((float)(x)))
+#define cosf(x) cos(((float)(x)))
+#define tanf(x) tan(((float)(x)))
+#define expf(x) exp(((float)(x)))
+#define sqrtf(x) sqrt(((float)(x)))
+#define logf(x) log(((float)(x)))
 #endif
 
 /* data lookup defines */
diff --git a/intern/cycles/kernel/kernel_debug.h b/intern/cycles/kernel/kernel_debug.h
new file mode 100644
index 00000000000..bf1bc0e9db8
--- /dev/null
+++ b/intern/cycles/kernel/kernel_debug.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void debug_data_init(DebugData *debug_data)
+{
+	debug_data->num_bvh_traversal_steps = 0;  /* start the BVH traversal step count at zero */
+}
+
+/* Write the BVH traversal step count to its pass slot when that pass is enabled. */
+ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
+                                                 ccl_global float *buffer,
+                                                 PathState *state,
+                                                 DebugData *debug_data,
+                                                 int sample)
+{
+	if(kernel_data.film.pass_flag & PASS_BVH_TRAVERSAL_STEPS) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversal_steps,
+		                        sample,
+		                        debug_data->num_bvh_traversal_steps);
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index deffa7f2ba2..4b2bb723ab6 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -63,32 +63,18 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 	return eval;
 }
 
-ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int lindex,
-	float randt, float randu, float randv, Ray *ray, BsdfEval *eval,
-	bool *is_lamp, int bounce, int transparent_bounce)
+ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
+	LightSample *ls, Ray *ray, BsdfEval *eval, bool *is_lamp,
+	int bounce, int transparent_bounce)
 {
-	LightSample ls;
-
-#ifdef __BRANCHED_PATH__
-	if(lindex != LAMP_NONE) {
-		/* sample position on a specified light */
-		light_select(kg, lindex, randu, randv, sd->P, &ls);
-	}
-	else
-#endif
-	{
-		/* sample a light and position on int */
-		light_sample(kg, randt, randu, randv, sd->time, sd->P, &ls);
-	}
-
-	if(ls.pdf == 0.0f)
+	if(ls->pdf == 0.0f)
 		return false;
 
 	/* todo: implement */
 	differential3 dD = differential3_zero();
 
 	/* evaluate closure */
-	float3 light_eval = direct_emissive_eval(kg, &ls, -ls.D, dD, ls.t, sd->time, bounce, transparent_bounce);
+	float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, sd->time, bounce, transparent_bounce);
 
 	if(is_zero(light_eval))
 		return false;
@@ -98,49 +84,51 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int
 
 #ifdef __VOLUME__
 	if(sd->prim != PRIM_NONE)
-		shader_bsdf_eval(kg, sd, ls.D, eval, &bsdf_pdf);
+		shader_bsdf_eval(kg, sd, ls->D, eval, &bsdf_pdf);
 	else
-		shader_volume_phase_eval(kg, sd, ls.D, eval, &bsdf_pdf);
+		shader_volume_phase_eval(kg, sd, ls->D, eval, &bsdf_pdf);
 #else
-	shader_bsdf_eval(kg, sd, ls.D, eval, &bsdf_pdf);
+	shader_bsdf_eval(kg, sd, ls->D, eval, &bsdf_pdf);
 #endif
 
-	if(ls.shader & SHADER_USE_MIS) {
+	if(ls->shader & SHADER_USE_MIS) {
 		/* multiple importance sampling */
-		float mis_weight = power_heuristic(ls.pdf, bsdf_pdf);
+		float mis_weight = power_heuristic(ls->pdf, bsdf_pdf);
 		light_eval *= mis_weight;
 	}
 	
-	bsdf_eval_mul(eval, light_eval/ls.pdf);
+	bsdf_eval_mul(eval, light_eval/ls->pdf);
 
 #ifdef __PASSES__
 	/* use visibility flag to skip lights */
-	if(ls.shader & SHADER_EXCLUDE_ANY) {
-		if(ls.shader & SHADER_EXCLUDE_DIFFUSE)
+	if(ls->shader & SHADER_EXCLUDE_ANY) {
+		if(ls->shader & SHADER_EXCLUDE_DIFFUSE)
 			eval->diffuse = make_float3(0.0f, 0.0f, 0.0f);
-		if(ls.shader & SHADER_EXCLUDE_GLOSSY)
+		if(ls->shader & SHADER_EXCLUDE_GLOSSY)
 			eval->glossy = make_float3(0.0f, 0.0f, 0.0f);
-		if(ls.shader & SHADER_EXCLUDE_TRANSMIT)
+		if(ls->shader & SHADER_EXCLUDE_TRANSMIT)
 			eval->transmission = make_float3(0.0f, 0.0f, 0.0f);
+		if(ls->shader & SHADER_EXCLUDE_SCATTER)
+			eval->scatter = make_float3(0.0f, 0.0f, 0.0f);
 	}
 #endif
 
 	if(bsdf_eval_is_zero(eval))
 		return false;
 
-	if(ls.shader & SHADER_CAST_SHADOW) {
+	if(ls->shader & SHADER_CAST_SHADOW) {
 		/* setup ray */
-		bool transmit = (dot(sd->Ng, ls.D) < 0.0f);
+		bool transmit = (dot(sd->Ng, ls->D) < 0.0f);
 		ray->P = ray_offset(sd->P, (transmit)? -sd->Ng: sd->Ng);
 
-		if(ls.t == FLT_MAX) {
+		if(ls->t == FLT_MAX) {
 			/* distant light */
-			ray->D = ls.D;
-			ray->t = ls.t;
+			ray->D = ls->D;
+			ray->t = ls->t;
 		}
 		else {
 			/* other lights, avoid self-intersection */
-			ray->D = ray_offset(ls.P, ls.Ng) - ray->P;
+			ray->D = ray_offset(ls->P, ls->Ng) - ray->P;
 			ray->D = normalize_len(ray->D, &ray->t);
 		}
 
@@ -153,7 +141,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, int
 	}
 
 	/* return if it's a lamp for shadow pass */
-	*is_lamp = (ls.prim == PRIM_NONE && ls.type != LIGHT_BACKGROUND);
+	*is_lamp = (ls->prim == PRIM_NONE && ls->type != LIGHT_BACKGROUND);
 
 	return true;
 }
@@ -201,13 +189,25 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st
 		if(ls.shader & SHADER_EXCLUDE_ANY) {
 			if(((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
 			   ((ls.shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & PATH_RAY_GLOSSY)) ||
-			   ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)))
+			   ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
+			   ((ls.shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
 				continue;
 		}
 #endif
 
 		float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time, state->bounce, state->transparent_bounce);
 
+#ifdef __VOLUME__
+		if(state->volume_stack[0].shader != SHADER_NONE) {
+			/* shadow attenuation */
+			Ray volume_ray = *ray;
+			volume_ray.t = ls.t;
+			float3 volume_tp = make_float3(1.0f, 1.0f, 1.0f);
+			kernel_volume_shadow(kg, state, &volume_ray, &volume_tp);
+			L *= volume_tp;
+		}
+#endif
+
 		if(!(state->flag & PATH_RAY_MIS_SKIP)) {
 			/* multiple importance sampling, get regular light pdf,
 			 * and compute weight with respect to BSDF pdf */
@@ -234,7 +234,8 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *sta
 		if(((shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
 		   ((shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & PATH_RAY_GLOSSY)) ||
 		   ((shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
-		   ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA)))
+		   ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA)) ||
+		   ((shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
 			return make_float3(0.0f, 0.0f, 0.0f);
 	}
 
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index 7a850844bf2..2a5b7689e57 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -14,6 +14,8 @@
  * limitations under the License
  */
 
+/* TODO(sergey): Consider moving portable ctz/clz stuff to util. */
+
 CCL_NAMESPACE_BEGIN
 
 /* "Correlated Multi-Jittered Sampling"
@@ -35,8 +37,16 @@ ccl_device_inline int cmj_fast_mod_pow2(int a, int b)
 /* a must be > 0 and b must be > 1 */
 ccl_device_inline int cmj_fast_div_pow2(int a, int b)
 {
-#if defined(__KERNEL_SSE2__) && !defined(_MSC_VER)
+	kernel_assert(a > 0);
+	kernel_assert(b > 1);
+#if defined(__KERNEL_SSE2__)
+#  ifdef _MSC_VER
+	unsigned long ctz;
+	_BitScanForward(&ctz, b);
+	return a >> ctz;
+#  else
 	return a >> __builtin_ctz(b);
+#  endif
 #else
 	return a/b;
 #endif
@@ -44,8 +54,15 @@ ccl_device_inline int cmj_fast_div_pow2(int a, int b)
 
 ccl_device_inline uint cmj_w_mask(uint w)
 {
-#if defined(__KERNEL_SSE2__) && !defined(_MSC_VER)
+	kernel_assert(w > 1);
+#if defined(__KERNEL_SSE2__)
+#  ifdef _MSC_VER
+	unsigned long leading_zero;
+	_BitScanReverse(&leading_zero, w);
+	return ((1 << (1 + leading_zero)) - 1);
+#  else
 	return ((1 << (32 - __builtin_clz(w))) - 1);
+#  endif
 #else
 	w |= w >> 1;
 	w |= w >> 2;
@@ -165,7 +182,8 @@ ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
 		smodm = cmj_fast_mod_pow2(s, m);
 	}
 	else {
-		sdivm = float_to_int(s * invm);
+		/* Doing s*invm gives precision issues here. */
+		sdivm = s / m;
 		smodm = s - sdivm*m;
 	}
 
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index ac432d3fe04..b18f67ad524 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -27,7 +27,7 @@ typedef struct LightSample {
 	float pdf;			/* light sampling probability density function */
 	float eval_fac;		/* intensity multiplier */
 	int object;			/* object id for triangle/curve lights */
-	int prim;			/* primitive id for triangle/curve ligths */
+	int prim;			/* primitive id for triangle/curve lights */
 	int shader;			/* shader id */
 	int lamp;			/* lamp id */
 	LightType type;		/* type of light */
@@ -167,12 +167,137 @@ ccl_device float3 sphere_light_sample(float3 P, float3 center, float radius, flo
 	return disk_light_sample(normalize(P - center), randu, randv)*radius;
 }
 
-ccl_device float3 area_light_sample(float3 axisu, float3 axisv, float randu, float randv)
+/* Sample the rectangular area light over the solid angle S seen from P.
+ * Returns the sample in world space; *pdf gets 1/S, or 0 if S is degenerate.
+ *
+ * Uses the paper by Carlos Urena et al.:
+ * An Area-Preserving Parametrization for Spherical Rectangles.
+ * https://www.solidangle.com/research/egsr2013_spherical_rectangle.pdf
+ */
+ccl_device float3 area_light_sample(float3 P,
+                                    float3 light_p,
+                                    float3 axisu, float3 axisv,
+                                    float randu, float randv,
+                                    float *pdf)
 {
-	randu = randu - 0.5f;
-	randv = randv - 0.5f;
+	/* In our name system we're using P for the center,
+	 * which is o in the paper.
+	 */
+
+	float3 corner = light_p - axisu * 0.5f - axisv * 0.5f;
+	float axisu_len, axisv_len;
+	/* Compute local reference system R. */
+	float3 x = normalize_len(axisu, &axisu_len);
+	float3 y = normalize_len(axisv, &axisv_len);
+	float3 z = cross(x, y);
+	/* Compute rectangle coords in local reference system. */
+	float3 dir = corner - P;
+	float z0 = dot(dir, z);
+	/* Flip 'z' to make it point against Q. */
+	if(z0 > 0.0f) {
+		z *= -1.0f;
+		z0 *= -1.0f;
+	}
+	float z0sq = z0 * z0;
+	float x0 = dot(dir, x);
+	float y0 = dot(dir, y);
+	float x1 = x0 + axisu_len;
+	float y1 = y0 + axisv_len;
+	float y0sq = y0 * y0;
+	float y1sq = y1 * y1;
+	/* Create vectors to four vertices. */
+	float3 v00 = make_float3(x0, y0, z0);
+	float3 v01 = make_float3(x0, y1, z0);
+	float3 v10 = make_float3(x1, y0, z0);
+	float3 v11 = make_float3(x1, y1, z0);
+	/* Compute normals to edges. */
+	float3 n0 = normalize(cross(v00, v10));
+	float3 n1 = normalize(cross(v10, v11));
+	float3 n2 = normalize(cross(v11, v01));
+	float3 n3 = normalize(cross(v01, v00));
+	/* Compute internal angles (gamma_i). */
+	float g0 = acosf(-dot(n0, n1));
+	float g1 = acosf(-dot(n1, n2));
+	float g2 = acosf(-dot(n2, n3));
+	float g3 = acosf(-dot(n3, n0));
+	/* Compute predefined constants. */
+	float b0 = n0.z;
+	float b1 = n2.z;
+	float b0sq = b0 * b0;
+	float k = M_2PI_F - g2 - g3;
+	/* Compute solid angle from internal angles. */
+	float S = g0 + g1 - k;
+
+	/* Compute cu. */
+	float au = randu * S + k;
+	float fu = (cosf(au) * b0 - b1) / sinf(au);
+	float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
+	cu = clamp(cu, -1.0f, 1.0f);
+	/* Compute xu. */
+	float xu = -(cu * z0) / sqrtf(1.0f - cu * cu);
+	xu = clamp(xu, x0, x1);
+	/* Compute yv. */
+	float d = sqrtf(xu * xu + z0sq);
+	float h0 = y0 / sqrtf(d * d + y0sq);
+	float h1 = y1 / sqrtf(d * d + y1sq);
+	float hv = h0 + randv * (h1 - h0), hv2 = hv * hv;
+	float yv = (hv2 < 1.0f - 1e-6f) ? (hv * d) / sqrtf(1.0f - hv2) : y1;
+
+	/* Degenerate (non-positive or NaN) solid angle: report pdf 0, not inf/NaN. */
+	*pdf = (S > 0.0f) ? 1.0f / S : 0.0f;
+	/* Transform (xu, yv, z0) to world coords. */
+	return P + xu * x + yv * y + z0 * z;
+}
 
-	return axisu*randu + axisv*randv;
+/* Pdf (1/S) of sampling the area light over the solid angle S seen from P.
+ * TODO(sergey): This duplicates area_light_sample(), but how to avoid this
+ * without having some nasty function with loads of parameters? */
+ccl_device float area_light_pdf(float3 P,
+                                float3 light_p,
+                                float3 axisu, float3 axisv)
+{
+	/* In our name system we're using P for the center,
+	 * which is o in the paper.
+	 */
+
+	float3 corner = light_p - axisu * 0.5f - axisv * 0.5f;
+	float axisu_len, axisv_len;
+	/* Compute local reference system R. */
+	float3 x = normalize_len(axisu, &axisu_len);
+	float3 y = normalize_len(axisv, &axisv_len);
+	float3 z = cross(x, y);
+	/* Compute rectangle coords in local reference system. */
+	float3 dir = corner - P;
+	float z0 = dot(dir, z);
+	/* Flip 'z' to make it point against Q. */
+	if(z0 > 0.0f) {
+		z *= -1.0f;
+		z0 *= -1.0f;
+	}
+	float x0 = dot(dir, x);
+	float y0 = dot(dir, y);
+	float x1 = x0 + axisu_len;
+	float y1 = y0 + axisv_len;
+	/* Create vectors to four vertices. */
+	float3 v00 = make_float3(x0, y0, z0);
+	float3 v01 = make_float3(x0, y1, z0);
+	float3 v10 = make_float3(x1, y0, z0);
+	float3 v11 = make_float3(x1, y1, z0);
+	/* Compute normals to edges. */
+	float3 n0 = normalize(cross(v00, v10));
+	float3 n1 = normalize(cross(v10, v11));
+	float3 n2 = normalize(cross(v11, v01));
+	float3 n3 = normalize(cross(v01, v00));
+	/* Compute internal angles (gamma_i). */
+	float g0 = acosf(-dot(n0, n1));
+	float g1 = acosf(-dot(n1, n2));
+	float g2 = acosf(-dot(n2, n3));
+	float g3 = acosf(-dot(n3, n0));
+	/* Compute predefined constants. */
+	float k = M_2PI_F - g2 - g3;
+	/* Solid angle from internal angles; non-positive or NaN S gives pdf 0. */
+	float S = g0 + g1 - k;
+	return (S > 0.0f) ? 1.0f / S : 0.0f;
 }
 
 ccl_device float spot_light_attenuation(float4 data1, float4 data2, LightSample *ls)
@@ -276,6 +401,7 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
 				float4 data2 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 2);
 				ls->eval_fac *= spot_light_attenuation(data1, data2, ls);
 			}
+			ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 		}
 		else {
 			/* area light */
@@ -286,18 +412,22 @@ ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
 			float3 axisv = make_float3(data2.y, data2.z, data2.w);
 			float3 D = make_float3(data3.y, data3.z, data3.w);
 
-			ls->P += area_light_sample(axisu, axisv, randu, randv);
+			ls->P = area_light_sample(P, ls->P,
+			                          axisu, axisv,
+			                          randu, randv,
+			                          &ls->pdf);
+
 			ls->Ng = D;
 			ls->D = normalize_len(ls->P - P, &ls->t);
 
 			float invarea = data2.x;
-
 			ls->eval_fac = 0.25f*invarea;
-			ls->pdf = invarea;
+
+			if(dot(ls->D, D) > 0.0f)
+				ls->pdf = 0.0f;
 		}
 
 		ls->eval_fac *= kernel_data.integrator.inv_pdf_lights;
-		ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 	}
 }
 
@@ -355,8 +485,12 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 		ls->D = D;
 		ls->t = FLT_MAX;
 
+		/* compute pdf */
 		float invarea = data1.w;
 		ls->pdf = invarea/(costheta*costheta*costheta);
+		if(ls->t != FLT_MAX)
+			ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
+
 		ls->eval_fac = ls->pdf;
 	}
 	else if(type == LIGHT_POINT || type == LIGHT_SPOT) {
@@ -386,6 +520,10 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 			if(ls->eval_fac == 0.0f)
 				return false;
 		}
+
+		/* compute pdf */
+		if(ls->t != FLT_MAX)
+			ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 	}
 	else if(type == LIGHT_AREA) {
 		/* area light */
@@ -412,16 +550,12 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 
 		ls->D = D;
 		ls->Ng = Ng;
-		ls->pdf = invarea;
-		ls->eval_fac = 0.25f*ls->pdf;
+		ls->pdf = area_light_pdf(P, ls->P, axisu, axisv);
+		ls->eval_fac = 0.25f*invarea;
 	}
 	else
 		return false;
 
-	/* compute pdf */
-	if(ls->t != FLT_MAX)
-		ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
-
 	return true;
 }
 
@@ -457,7 +591,7 @@ ccl_device void triangle_light_sample(KernelGlobals *kg, int prim, int object,
 	v = randv*randu;
 
 	/* triangle, so get position, normal, shader */
-	triangle_point_normal(kg, prim, u, v, &ls->P, &ls->Ng, &ls->shader);
+	triangle_point_normal(kg, object, prim, u, v, &ls->P, &ls->Ng, &ls->shader);
 	ls->object = object;
 	ls->prim = prim;
 	ls->lamp = LAMP_NONE;
@@ -546,11 +680,6 @@ ccl_device int light_select_num_samples(KernelGlobals *kg, int index)
 	return __float_as_int(data3.x);
 }
 
-ccl_device void light_select(KernelGlobals *kg, int index, float randu, float randv, float3 P, LightSample *ls)
-{
-	lamp_light_sample(kg, index, randu, randv, P, ls);
-}
-
 ccl_device int lamp_light_eval_sample(KernelGlobals *kg, float randt)
 {
 	/* sample index */
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index a80a0033712..c03229f0a3a 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -29,7 +29,6 @@
 #include "kernel_accumulate.h"
 #include "kernel_shader.h"
 #include "kernel_light.h"
-#include "kernel_emission.h"
 #include "kernel_passes.h"
 
 #ifdef __SUBSURFACE__
@@ -42,177 +41,15 @@
 
 #include "kernel_path_state.h"
 #include "kernel_shadow.h"
+#include "kernel_emission.h"
+#include "kernel_path_surface.h"
+#include "kernel_path_volume.h"
 
-CCL_NAMESPACE_BEGIN
-
-#ifdef __VOLUME__
-
-ccl_device_inline bool kernel_path_integrate_scatter_lighting(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray,
-	float num_samples_adjust)
-{
-#ifdef __EMISSION__
-	if(kernel_data.integrator.use_direct_light) {
-		/* sample illumination from lights to find path contribution */
-		if(sd->flag & SD_BSDF_HAS_EVAL) {
-			float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
-			float light_u, light_v;
-			path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-
-			Ray light_ray;
-			BsdfEval L_light;
-			bool is_lamp;
-
-#ifdef __OBJECT_MOTION__
-			light_ray.time = sd->time;
-#endif
-
-			if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
-				/* trace shadow ray */
-				float3 shadow;
-
-				if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
-					/* accumulate */
-					path_radiance_accum_light(L, *throughput * num_samples_adjust, &L_light, shadow, 1.0f, state->bounce, is_lamp);
-				}
-			}
-		}
-	}
-#endif
-
-	/* sample phase function */
-	float phase_pdf;
-	BsdfEval phase_eval;
-	float3 phase_omega_in;
-	differential3 phase_domega_in;
-	float phase_u, phase_v;
-	path_state_rng_2D(kg, rng, state, PRNG_PHASE_U, &phase_u, &phase_v);
-	int label;
-
-	label = shader_volume_phase_sample(kg, sd, phase_u, phase_v, &phase_eval,
-		&phase_omega_in, &phase_domega_in, &phase_pdf);
-
-	if(phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval))
-		return false;
-	
-	/* modify throughput */
-	path_radiance_bsdf_bounce(L, throughput, &phase_eval, phase_pdf, state->bounce, label);
-
-	/* set labels */
-	state->ray_pdf = phase_pdf;
-#ifdef __LAMP_MIS__
-	state->ray_t = 0.0f;
-#endif
-	state->min_ray_pdf = fminf(phase_pdf, state->min_ray_pdf);
-
-	/* update path state */
-	path_state_next(kg, state, label);
-
-	/* setup ray */
-	ray->P = sd->P;
-	ray->D = phase_omega_in;
-	ray->t = FLT_MAX;
-
-#ifdef __RAY_DIFFERENTIALS__
-	ray->dP = sd->dP;
-	ray->dD = phase_domega_in;
-#endif
-
-	return true;
-}
-
+#ifdef __KERNEL_DEBUG__
+#include "kernel_debug.h"
 #endif
 
-#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__)
-
-ccl_device void kernel_branched_path_integrate_direct_lighting(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, PathState *state, float3 throughput, float num_samples_adjust, PathRadiance *L, bool sample_all_lights)
-{
-	/* sample illumination from lights to find path contribution */
-	if(sd->flag & SD_BSDF_HAS_EVAL) {
-		Ray light_ray;
-		BsdfEval L_light;
-		bool is_lamp;
-
-#ifdef __OBJECT_MOTION__
-		light_ray.time = sd->time;
-#endif
-
-		if(sample_all_lights) {
-			/* lamp sampling */
-			for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
-				int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
-				float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
-				RNG lamp_rng = cmj_hash(*rng, i);
-
-				if(kernel_data.integrator.pdf_triangles != 0.0f)
-					num_samples_inv *= 0.5f;
-
-				for(int j = 0; j < num_samples; j++) {
-					float light_u, light_v;
-					path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
-
-					if(direct_emission(kg, sd, i, 0.0f, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
-						/* trace shadow ray */
-						float3 shadow;
-
-						if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
-							/* accumulate */
-							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
-						}
-					}
-				}
-			}
-
-			/* mesh light sampling */
-			if(kernel_data.integrator.pdf_triangles != 0.0f) {
-				int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples);
-				float num_samples_inv = num_samples_adjust/num_samples;
-
-				if(kernel_data.integrator.num_all_lights)
-					num_samples_inv *= 0.5f;
-
-				for(int j = 0; j < num_samples; j++) {
-					float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT);
-					float light_u, light_v;
-					path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
-
-					/* only sample triangle lights */
-					if(kernel_data.integrator.num_all_lights)
-						light_t = 0.5f*light_t;
-
-					if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
-						/* trace shadow ray */
-						float3 shadow;
-
-						if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
-							/* accumulate */
-							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
-						}
-					}
-				}
-			}
-		}
-		else {
-			float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
-			float light_u, light_v;
-			path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-
-			/* sample random light */
-			if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
-				/* trace shadow ray */
-				float3 shadow;
-
-				if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
-					/* accumulate */
-					path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
-				}
-			}
-		}
-	}
-}
-
-#endif
+CCL_NAMESPACE_BEGIN
 
 ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 	float3 throughput, int num_samples, PathState state, PathRadiance *L)
@@ -222,11 +59,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 		/* intersect scene */
 		Intersection isect;
 		uint visibility = path_state_ray_visibility(kg, &state);
-#ifdef __HAIR__
 		bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#else
-		bool hit = scene_intersect(kg, &ray, visibility, &isect);
-#endif
 
 #ifdef __LAMP_MIS__
 		if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
@@ -255,15 +88,81 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 			Ray volume_ray = ray;
 			volume_ray.t = (hit)? isect.t: FLT_MAX;
 
-			ShaderData volume_sd;
-			VolumeIntegrateResult result = kernel_volume_integrate(kg, &state,
-				&volume_sd, &volume_ray, L, &throughput, rng);
+			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
 
-			if(result == VOLUME_PATH_SCATTERED) {
-				if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &throughput, &state, L, &ray, 1.0f))
-					continue;
-				else
-					break;
+#ifdef __VOLUME_DECOUPLED__
+			int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
+			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false, sampling_method);
+
+			if(decoupled) {
+				/* cache steps along volume for repeated sampling */
+				VolumeSegment volume_segment;
+				ShaderData volume_sd;
+
+				shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
+				kernel_volume_decoupled_record(kg, &state,
+					&volume_ray, &volume_sd, &volume_segment, heterogeneous);
+				
+				volume_segment.sampling_method = sampling_method;
+
+				/* emission */
+				if(volume_segment.closure_flag & SD_EMISSION)
+					path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
+
+				/* scattering */
+				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
+
+				if(volume_segment.closure_flag & SD_SCATTER) {
+					bool all = kernel_data.integrator.sample_all_lights_indirect;
+
+					/* direct light sampling */
+					kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
+						throughput, &state, L, 1.0f, all, &volume_ray, &volume_segment);
+
+					/* indirect sample. if we use distance sampling and take just
+					 * one sample for direct and indirect light, we could share
+					 * this computation, but that would make the code more complex */
+					float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
+					float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
+
+					result = kernel_volume_decoupled_scatter(kg,
+						&state, &volume_ray, &volume_sd, &throughput,
+						rphase, rscatter, &volume_segment, NULL, true);
+				}
+
+				if(result != VOLUME_PATH_SCATTERED)
+					throughput *= volume_segment.accum_transmittance;
+
+				/* free cached steps */
+				kernel_volume_decoupled_free(kg, &volume_segment);
+
+				if(result == VOLUME_PATH_SCATTERED) {
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray))
+						continue;
+					else
+						break;
+				}
+			}
+			else
+#endif
+			{
+				/* integrate along volume segment with distance sampling */
+				ShaderData volume_sd;
+				VolumeIntegrateResult result = kernel_volume_integrate(
+					kg, &state, &volume_sd, &volume_ray, L, &throughput, rng, heterogeneous);
+
+#ifdef __VOLUME_SCATTER__
+				if(result == VOLUME_PATH_SCATTERED) {
+					/* direct lighting */
+					kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, L);
+
+					/* indirect light bounce */
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray))
+						continue;
+					else
+						break;
+				}
+#endif
 			}
 		}
 #endif
@@ -281,7 +180,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 		/* setup shading */
 		ShaderData sd;
 		shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce);
-		float rbsdf = path_state_rng_1D(kg, rng, &state, PRNG_BSDF);
+		float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF);
 		shader_eval_surface(kg, &sd, rbsdf, state.flag, SHADER_CONTEXT_INDIRECT);
 #ifdef __BRANCHED_PATH__
 		shader_merge_closures(&sd);
@@ -315,7 +214,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 			break;
 		}
 		else if(probability != 1.0f) {
-			float terminate = path_state_rng_1D(kg, rng, &state, PRNG_TERMINATE);
+			float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
 
 			if(terminate >= probability)
 				break;
@@ -383,187 +282,12 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 #if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
 		if(kernel_data.integrator.use_direct_light) {
 			bool all = kernel_data.integrator.sample_all_lights_indirect;
-			kernel_branched_path_integrate_direct_lighting(kg, rng, &sd, &state, throughput, 1.0f, L, all);
-		}
-#endif
-
-		/* no BSDF? we can stop here */
-		if(sd.flag & SD_BSDF) {
-			/* sample BSDF */
-			float bsdf_pdf;
-			BsdfEval bsdf_eval;
-			float3 bsdf_omega_in;
-			differential3 bsdf_domega_in;
-			float bsdf_u, bsdf_v;
-			path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-			int label;
-
-			label = shader_bsdf_sample(kg, &sd, bsdf_u, bsdf_v, &bsdf_eval,
-				&bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
-
-			if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
-				break;
-
-			/* modify throughput */
-			path_radiance_bsdf_bounce(L, &throughput, &bsdf_eval, bsdf_pdf, state.bounce, label);
-
-			/* set labels */
-			if(!(label & LABEL_TRANSPARENT)) {
-				state.ray_pdf = bsdf_pdf;
-#ifdef __LAMP_MIS__
-				state.ray_t = 0.0f;
-#endif
-				state.min_ray_pdf = fminf(bsdf_pdf, state.min_ray_pdf);
-			}
-
-			/* update path state */
-			path_state_next(kg, &state, label);
-
-			/* setup ray */
-			ray.P = ray_offset(sd.P, (label & LABEL_TRANSMIT)? -sd.Ng: sd.Ng);
-			ray.D = bsdf_omega_in;
-			ray.t = FLT_MAX;
-#ifdef __RAY_DIFFERENTIALS__
-			ray.dP = sd.dP;
-			ray.dD = bsdf_domega_in;
-#endif
-
-#ifdef __VOLUME__
-			/* enter/exit volume */
-			if(label & LABEL_TRANSMIT)
-				kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
-#endif
+			kernel_branched_path_surface_connect_light(kg, rng, &sd, &state, throughput, 1.0f, L, all);
 		}
-#ifdef __VOLUME__
-		else if(sd.flag & SD_HAS_ONLY_VOLUME) {
-			/* no surface shader but have a volume shader? act transparent */
-
-			/* update path state, count as transparent */
-			path_state_next(kg, &state, LABEL_TRANSPARENT);
-
-			/* setup ray position, direction stays unchanged */
-			ray.P = ray_offset(sd.P, -sd.Ng);
-#ifdef __RAY_DIFFERENTIALS__
-			ray.dP = sd.dP;
 #endif
 
-			/* enter/exit volume */
-			kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
-		}
-#endif
-		else {
-			/* no bsdf or volume? we're done */
+		if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
 			break;
-		}
-	}
-}
-
-ccl_device_inline bool kernel_path_integrate_lighting(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
-{
-#ifdef __EMISSION__
-	if(kernel_data.integrator.use_direct_light) {
-		/* sample illumination from lights to find path contribution */
-		if(sd->flag & SD_BSDF_HAS_EVAL) {
-			float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
-			float light_u, light_v;
-			path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-
-			Ray light_ray;
-			BsdfEval L_light;
-			bool is_lamp;
-
-#ifdef __OBJECT_MOTION__
-			light_ray.time = sd->time;
-#endif
-
-			if(direct_emission(kg, sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
-				/* trace shadow ray */
-				float3 shadow;
-
-				if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
-					/* accumulate */
-					path_radiance_accum_light(L, *throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
-				}
-			}
-		}
-	}
-#endif
-
-	/* no BSDF? we can stop here */
-	if(sd->flag & SD_BSDF) {
-		/* sample BSDF */
-		float bsdf_pdf;
-		BsdfEval bsdf_eval;
-		float3 bsdf_omega_in;
-		differential3 bsdf_domega_in;
-		float bsdf_u, bsdf_v;
-		path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-		int label;
-
-		label = shader_bsdf_sample(kg, sd, bsdf_u, bsdf_v, &bsdf_eval,
-			&bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
-
-		if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
-			return false;
-
-		/* modify throughput */
-		path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
-
-		/* set labels */
-		if(!(label & LABEL_TRANSPARENT)) {
-			state->ray_pdf = bsdf_pdf;
-#ifdef __LAMP_MIS__
-			state->ray_t = 0.0f;
-#endif
-			state->min_ray_pdf = fminf(bsdf_pdf, state->min_ray_pdf);
-		}
-
-		/* update path state */
-		path_state_next(kg, state, label);
-
-		/* setup ray */
-		ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
-		ray->D = bsdf_omega_in;
-
-		if(state->bounce == 0)
-			ray->t -= sd->ray_length; /* clipping works through transparent */
-		else
-			ray->t = FLT_MAX;
-
-#ifdef __RAY_DIFFERENTIALS__
-		ray->dP = sd->dP;
-		ray->dD = bsdf_domega_in;
-#endif
-
-#ifdef __VOLUME__
-		/* enter/exit volume */
-		if(label & LABEL_TRANSMIT)
-			kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
-#endif
-		return true;
-	}
-#ifdef __VOLUME__
-	else if(sd->flag & SD_HAS_ONLY_VOLUME) {
-		/* no surface shader but have a volume shader? act transparent */
-
-		/* update path state, count as transparent */
-		path_state_next(kg, state, LABEL_TRANSPARENT);
-
-		/* setup ray position, direction stays unchanged */
-		ray->P = ray_offset(sd->P, -sd->Ng);
-#ifdef __RAY_DIFFERENTIALS__
-		ray->dP = sd->dP;
-#endif
-
-		/* enter/exit volume */
-		kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
-		return true;
-	}
-#endif
-	else {
-		/* no bsdf or volume? */
-		return false;
 	}
 }
 
@@ -601,7 +325,68 @@ ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *
 	}
 }
 
+ccl_device void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, float3 throughput)
+{ /* ambient occlusion for branched path tracing: traces ao_samples occlusion rays from sd and adds the unblocked ones to L */
+	int num_samples = kernel_data.integrator.ao_samples;
+	float num_samples_inv = 1.0f/num_samples; /* per-sample weight applied to throughput below */
+	float ao_factor = kernel_data.background.ao_factor;
+	float3 ao_N;
+	float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); /* also fills in ao_N, which the sampling below is centered on */
+	float3 ao_alpha = shader_bsdf_alpha(kg, sd);
+
+	for(int j = 0; j < num_samples; j++) {
+		float bsdf_u, bsdf_v;
+		path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v); /* 2D random point for sample j of num_samples */
+
+		float3 ao_D;
+		float ao_pdf;
+
+		sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); /* pick occlusion ray direction ao_D and its pdf */
+
+		if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { /* skip directions on the wrong side of sd->Ng or with zero pdf */
+			Ray light_ray;
+			float3 ao_shadow;
+
+			light_ray.P = ray_offset(sd->P, sd->Ng);
+			light_ray.D = ao_D;
+			light_ray.t = kernel_data.background.ao_distance; /* ray length is limited to the AO distance setting */
+#ifdef __OBJECT_MOTION__
+			light_ray.time = sd->time;
+#endif
+			light_ray.dP = sd->dP;
+			light_ray.dD = differential3_zero();
+
+			if(!shadow_blocked(kg, state, &light_ray, &ao_shadow)) /* only rays that are not blocked contribute */
+				path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+		}
+	}
+}
+
 #ifdef __SUBSURFACE__
+
+#ifdef __VOLUME__
+ccl_device void kernel_path_subsurface_update_volume_stack(KernelGlobals *kg,
+                                                           Ray *ray,
+                                                           VolumeStack *stack)
+{ /* updates stack once for each hit that scene_intersect_volume reports along ray */
+	kernel_assert(kernel_data.integrator.use_volumes);
+
+	Ray volume_ray = *ray; /* march a local copy; *ray itself is only read */
+	Intersection isect;
+
+	while(scene_intersect_volume(kg, &volume_ray, &isect))
+	{
+		ShaderData sd;
+		shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0); /* bounce and transparent_bounce arguments are passed as zero */
+		kernel_volume_stack_enter_exit(kg, &sd, stack);
+
+		/* Move ray forward. */
+		volume_ray.P = ray_offset(sd.P, -sd.Ng); /* restart from the hit point, offset along -sd.Ng */
+		volume_ray.t -= sd.ray_length; /* remaining length shrinks by sd.ray_length */
+	}
+}
+#endif
+
 ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, Ray *ray, float3 *throughput)
 {
 	float bssrdf_probability;
@@ -618,6 +403,11 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
 		float bssrdf_u, bssrdf_v;
 		path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
 		int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
+#ifdef __VOLUME__
+		Ray volume_ray = *ray;
+		bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
+		                                sd->flag & SD_OBJECT_INTERSECTS_VOLUME;
+#endif
 
 		/* compute lighting with the BSDF closure */
 		for(int hit = 0; hit < num_hits; hit++) {
@@ -627,12 +417,30 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
 
 			hit_state.flag |= PATH_RAY_BSSRDF_ANCESTOR;
 			hit_state.rng_offset += PRNG_BOUNCE_NUM;
+			
+			kernel_path_surface_connect_light(kg, rng, &bssrdf_sd[hit], tp, state, L);
 
-			if(kernel_path_integrate_lighting(kg, rng, &bssrdf_sd[hit], &tp, &hit_state, L, &hit_ray)) {
+			if(kernel_path_surface_bounce(kg, rng, &bssrdf_sd[hit], &tp, &hit_state, L, &hit_ray)) {
 #ifdef __LAMP_MIS__
 				hit_state.ray_t = 0.0f;
 #endif
 
+#ifdef __VOLUME__
+				if(need_update_volume_stack) {
+					/* Setup ray from previous surface point to the new one. */
+					volume_ray.D = normalize_len(hit_ray.P - volume_ray.P,
+					                             &volume_ray.t);
+
+					kernel_path_subsurface_update_volume_stack(
+					    kg,
+					    &volume_ray,
+					    hit_state.volume_stack);
+
+					/* Move volume ray forward. */
+					volume_ray.P = hit_ray.P;
+				}
+#endif
+
 				kernel_path_indirect(kg, rng, hit_ray, tp, state->num_samples, hit_state, L);
 
 				/* for render passes, sum and reset indirect light pass variables
@@ -657,7 +465,12 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 	path_radiance_init(&L, kernel_data.film.use_light_pass);
 
 	PathState state;
-	path_state_init(kg, &state, rng, sample);
+	path_state_init(kg, &state, rng, sample, &ray);
+
+#ifdef __KERNEL_DEBUG__
+	DebugData debug_data;
+	debug_data_init(&debug_data);
+#endif
 
 	/* path iteration */
 	for(;;) {
@@ -682,7 +495,13 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 
 		bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax);
 #else
-		bool hit = scene_intersect(kg, &ray, visibility, &isect);
+		bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
+#endif
+
+#ifdef __KERNEL_DEBUG__
+		if(state.flag & PATH_RAY_CAMERA) {
+			debug_data.num_bvh_traversal_steps += isect.num_traversal_steps;
+		}
 #endif
 
 #ifdef __LAMP_MIS__
@@ -712,15 +531,81 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 			Ray volume_ray = ray;
 			volume_ray.t = (hit)? isect.t: FLT_MAX;
 
-			ShaderData volume_sd;
-			VolumeIntegrateResult result = kernel_volume_integrate(kg, &state,
-				&volume_sd, &volume_ray, &L, &throughput, rng);
+			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
 
-			if(result == VOLUME_PATH_SCATTERED) {
-				if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &throughput, &state, &L, &ray, 1.0f))
-					continue;
-				else
-					break;
+#ifdef __VOLUME_DECOUPLED__
+			int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
+			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method);
+
+			if(decoupled) {
+				/* cache steps along volume for repeated sampling */
+				VolumeSegment volume_segment;
+				ShaderData volume_sd;
+
+				shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
+				kernel_volume_decoupled_record(kg, &state,
+					&volume_ray, &volume_sd, &volume_segment, heterogeneous);
+
+				volume_segment.sampling_method = sampling_method;
+
+				/* emission */
+				if(volume_segment.closure_flag & SD_EMISSION)
+					path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+
+				/* scattering */
+				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
+
+				if(volume_segment.closure_flag & SD_SCATTER) {
+					bool all = false;
+
+					/* direct light sampling */
+					kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
+						throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
+
+					/* indirect sample. if we use distance sampling and take just
+					 * one sample for direct and indirect light, we could share
+					 * this computation, but that would make the code more complex */
+					float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
+					float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
+
+					result = kernel_volume_decoupled_scatter(kg,
+						&state, &volume_ray, &volume_sd, &throughput,
+						rphase, rscatter, &volume_segment, NULL, true);
+				}
+
+				if(result != VOLUME_PATH_SCATTERED)
+					throughput *= volume_segment.accum_transmittance;
+
+				/* free cached steps */
+				kernel_volume_decoupled_free(kg, &volume_segment);
+
+				if(result == VOLUME_PATH_SCATTERED) {
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+						continue;
+					else
+						break;
+				}
+			}
+			else 
+#endif
+			{
+				/* integrate along volume segment with distance sampling */
+				ShaderData volume_sd;
+				VolumeIntegrateResult result = kernel_volume_integrate(
+					kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng, heterogeneous);
+
+#ifdef __VOLUME_SCATTER__
+				if(result == VOLUME_PATH_SCATTERED) {
+					/* direct lighting */
+					kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L);
+
+					/* indirect light bounce */
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+						continue;
+					else
+						break;
+				}
+#endif
 			}
 		}
 #endif
@@ -748,7 +633,7 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 		/* setup shading */
 		ShaderData sd;
 		shader_setup_from_ray(kg, &sd, &isect, &ray, state.bounce, state.transparent_bounce);
-		float rbsdf = path_state_rng_1D(kg, rng, &state, PRNG_BSDF);
+		float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF);
 		shader_eval_surface(kg, &sd, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
 
 		/* holdout */
@@ -803,7 +688,7 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 			break;
 		}
 		else if(probability != 1.0f) {
-			float terminate = path_state_rng_1D(kg, rng, &state, PRNG_TERMINATE);
+			float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
 
 			if(terminate >= probability)
 				break;
@@ -826,134 +711,33 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 				break;
 		}
 #endif
-		
-		/* Same as kernel_path_integrate_lighting(kg, rng, &sd, &throughput, &state, &L, &ray),
-		   but for CUDA the function call is slower. */
-#ifdef __EMISSION__
-		if(kernel_data.integrator.use_direct_light) {
-			/* sample illumination from lights to find path contribution */
-			if(sd.flag & SD_BSDF_HAS_EVAL) {
-				float light_t = path_state_rng_1D(kg, rng, &state, PRNG_LIGHT);
-				float light_u, light_v;
-				path_state_rng_2D(kg, rng, &state, PRNG_LIGHT_U, &light_u, &light_v);
-
-				Ray light_ray;
-				BsdfEval L_light;
-				bool is_lamp;
-
-#ifdef __OBJECT_MOTION__
-				light_ray.time = sd.time;
-#endif
-
-				if(direct_emission(kg, &sd, LAMP_NONE, light_t, light_u, light_v, &light_ray, &L_light, &is_lamp, state.bounce, state.transparent_bounce)) {
-					/* trace shadow ray */
-					float3 shadow;
-
-					if(!shadow_blocked(kg, &state, &light_ray, &shadow)) {
-						/* accumulate */
-						path_radiance_accum_light(&L, throughput, &L_light, shadow, 1.0f, state.bounce, is_lamp);
-					}
-				}
-			}
-		}
-#endif
-
-		if(sd.flag & SD_BSDF) {
-			/* sample BSDF */
-			float bsdf_pdf;
-			BsdfEval bsdf_eval;
-			float3 bsdf_omega_in;
-			differential3 bsdf_domega_in;
-			float bsdf_u, bsdf_v;
-			path_state_rng_2D(kg, rng, &state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-			int label;
-
-			label = shader_bsdf_sample(kg, &sd, bsdf_u, bsdf_v, &bsdf_eval,
-				&bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
-
-			if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
-				break;
-
-			/* modify throughput */
-			path_radiance_bsdf_bounce(&L, &throughput, &bsdf_eval, bsdf_pdf, state.bounce, label);
-
-			/* set labels */
-			if(!(label & LABEL_TRANSPARENT)) {
-				state.ray_pdf = bsdf_pdf;
-#ifdef __LAMP_MIS__
-				state.ray_t = 0.0f;
-#endif
-				state.min_ray_pdf = fminf(bsdf_pdf, state.min_ray_pdf);
-			}
-
-			/* update path state */
-			path_state_next(kg, &state, label);
-
-			/* setup ray */
-			ray.P = ray_offset(sd.P, (label & LABEL_TRANSMIT)? -sd.Ng: sd.Ng);
-			ray.D = bsdf_omega_in;
-
-#ifdef __RAY_DIFFERENTIALS__
-			ray.dP = sd.dP;
-			ray.dD = bsdf_domega_in;
-#endif
-
-#ifdef __VOLUME__
-			/* enter/exit volume */
-			if(label & LABEL_TRANSMIT)
-				kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
-#endif
 
-		}
-#ifdef __VOLUME__
-		else if(sd.flag & SD_HAS_ONLY_VOLUME) {
-			/* no surface shader but have a volume shader? act transparent */
+		/* direct lighting */
+		kernel_path_surface_connect_light(kg, rng, &sd, throughput, &state, &L);
 
-			/* update path state, count as transparent */
-			path_state_next(kg, &state, LABEL_TRANSPARENT);
-
-			/* setup ray position, direction stays unchanged */
-			ray.P = ray_offset(sd.P, -sd.Ng);
-#ifdef __RAY_DIFFERENTIALS__
-			ray.dP = sd.dP;
-#endif
-
-			/* enter/exit volume */
-			kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
-		}
-#endif
-		else {
-			/* no bsdf or volume? we're done */
+		/* indirect light bounce */
+		if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
 			break;
-		}
-
-		/* adjust ray distance for clipping */
-		if(state.bounce == 0)
-			ray.t -= sd.ray_length; /* clipping works through transparent */
-		else
-			ray.t = FLT_MAX;
 	}
 
 	float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
 
 	kernel_write_light_passes(kg, buffer, &L, sample);
 
+#ifdef __KERNEL_DEBUG__
+	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
+#endif
+
 	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
 }
 
 #ifdef __BRANCHED_PATH__
 
-ccl_device_noinline void kernel_branched_path_integrate_lighting(KernelGlobals *kg,
+/* branched path tracing: bounce off surface and integrate indirect light */
+ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
 	RNG *rng, ShaderData *sd, float3 throughput, float num_samples_adjust,
-	PathState *state, PathRadiance *L, ccl_global float *buffer)
+	PathState *state, PathRadiance *L)
 {
-#ifdef __EMISSION__
-	if(kernel_data.integrator.use_direct_light) {
-		bool all = kernel_data.integrator.sample_all_lights_direct;
-		kernel_branched_path_integrate_direct_lighting(kg, rng, sd, state, throughput, num_samples_adjust, L, all);
-	}
-#endif
-
 	for(int i = 0; i< sd->num_closure; i++) {
 		const ShaderClosure *sc = &sd->closure[i];
 
@@ -980,68 +764,102 @@ ccl_device_noinline void kernel_branched_path_integrate_lighting(KernelGlobals *
 		RNG bsdf_rng = cmj_hash(*rng, i);
 
 		for(int j = 0; j < num_samples; j++) {
-			/* sample BSDF */
-			float bsdf_pdf;
-			BsdfEval bsdf_eval;
-			float3 bsdf_omega_in;
-			differential3 bsdf_domega_in;
-			float bsdf_u, bsdf_v;
-			path_branched_rng_2D(kg, &bsdf_rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-			int label;
-
-			label = shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval,
-				&bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
+			PathState ps = *state;
+			float3 tp = throughput;
+			Ray bsdf_ray;
 
-			if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
+			if(!kernel_branched_path_surface_bounce(kg, &bsdf_rng, sd, sc, j, num_samples, &tp, &ps, L, &bsdf_ray))
 				continue;
 
-			/* modify throughput */
-			float3 tp = throughput;
-			path_radiance_bsdf_bounce(L, &tp, &bsdf_eval, bsdf_pdf, state->bounce, label);
+			kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L);
 
-			/* modify path state */
-			PathState ps = *state;
-			path_state_next(kg, &ps, label);
+			/* for render passes, sum and reset indirect light pass variables
+			 * for the next samples */
+			path_radiance_sum_indirect(L);
+			path_radiance_reset_indirect(L);
+		}
+	}
+}
 
-			/* setup ray */
-			Ray bsdf_ray;
+#ifdef __SUBSURFACE__
+ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
+                                                        ShaderData *sd,
+                                                        PathRadiance *L,
+                                                        PathState *state,
+                                                        RNG *rng,
+                                                        Ray *ray,
+                                                        float3 throughput)
+{
+	for(int i = 0; i< sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
-			bsdf_ray.P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
-			bsdf_ray.D = bsdf_omega_in;
-			bsdf_ray.t = FLT_MAX;
-#ifdef __RAY_DIFFERENTIALS__
-			bsdf_ray.dP = sd->dP;
-			bsdf_ray.dD = bsdf_domega_in;
-#endif
-#ifdef __OBJECT_MOTION__
-			bsdf_ray.time = sd->time;
-#endif
+		if(!CLOSURE_IS_BSSRDF(sc->type))
+			continue;
+
+		/* set up random number generator */
+		uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
+		int num_samples = kernel_data.integrator.subsurface_samples;
+		float num_samples_inv = 1.0f/num_samples;
+		RNG bssrdf_rng = cmj_hash(*rng, i);
+
+		state->flag |= PATH_RAY_BSSRDF_ANCESTOR;
 
+		/* do subsurface scatter step with copy of shader data, this will
+		 * replace the BSSRDF with a diffuse BSDF closure */
+		for(int j = 0; j < num_samples; j++) {
+			ShaderData bssrdf_sd[BSSRDF_MAX_HITS];
+			float bssrdf_u, bssrdf_v;
+			path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+			int num_hits = subsurface_scatter_multi_step(kg, sd, bssrdf_sd, state->flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
 #ifdef __VOLUME__
-			/* enter/exit volume */
-			if(label & LABEL_TRANSMIT)
-				kernel_volume_stack_enter_exit(kg, sd, ps.volume_stack);
+			Ray volume_ray = *ray;
+			bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
+			                                sd->flag & SD_OBJECT_INTERSECTS_VOLUME;
 #endif
 
-			/* branch RNG state */
-			path_state_branch(&ps, j, num_samples);
+			/* compute lighting with the BSDF closure */
+			for(int hit = 0; hit < num_hits; hit++) {
+				PathState hit_state = *state;
 
-			/* set MIS state */
-			ps.min_ray_pdf = fminf(bsdf_pdf, FLT_MAX);
-			ps.ray_pdf = bsdf_pdf;
-#ifdef __LAMP_MIS__
-			ps.ray_t = 0.0f;
+				path_state_branch(&hit_state, j, num_samples);
+
+#ifdef __VOLUME__
+				if(need_update_volume_stack) {
+					/* Setup ray from previous surface point to the new one. */
+					float3 P = ray_offset(bssrdf_sd[hit].P, -bssrdf_sd[hit].Ng);
+					volume_ray.D = normalize_len(P - volume_ray.P,
+					                             &volume_ray.t);
+
+					kernel_path_subsurface_update_volume_stack(
+					    kg,
+					    &volume_ray,
+					    hit_state.volume_stack);
+
+					/* Move volume ray forward. */
+					volume_ray.P = P;
+				}
 #endif
 
-			kernel_path_indirect(kg, rng, bsdf_ray, tp*num_samples_inv, num_samples, ps, L);
+#if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
+				/* direct light */
+				if(kernel_data.integrator.use_direct_light) {
+					bool all = kernel_data.integrator.sample_all_lights_direct;
+					kernel_branched_path_surface_connect_light(kg, rng,
+						&bssrdf_sd[hit], &hit_state, throughput, num_samples_inv, L, all);
+				}
+#endif
 
-			/* for render passes, sum and reset indirect light pass variables
-			 * for the next samples */
-			path_radiance_sum_indirect(L);
-			path_radiance_reset_indirect(L);
+				/* indirect light */
+				kernel_branched_path_surface_indirect_light(kg, rng,
+					&bssrdf_sd[hit], throughput, num_samples_inv,
+					&hit_state, L);
+			}
 		}
+
+		state->flag &= ~PATH_RAY_BSSRDF_ANCESTOR;
 	}
 }
+#endif
 
 ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
 {
@@ -1053,7 +871,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 	path_radiance_init(&L, kernel_data.film.use_light_pass);
 
 	PathState state;
-	path_state_init(kg, &state, rng, sample);
+	path_state_init(kg, &state, rng, sample, &ray);
+
+#ifdef __KERNEL_DEBUG__
+	DebugData debug_data;
+	debug_data_init(&debug_data);
+#endif
 
 	for(;;) {
 		/* intersect scene */
@@ -1077,7 +900,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 
 		bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax);
 #else
-		bool hit = scene_intersect(kg, &ray, visibility, &isect);
+		bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
+#endif
+
+#ifdef __KERNEL_DEBUG__
+		if(state.flag & PATH_RAY_CAMERA) {
+			debug_data.num_bvh_traversal_steps += isect.num_traversal_steps;
+		}
 #endif
 
 #ifdef __VOLUME__
@@ -1085,10 +914,11 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		if(state.volume_stack[0].shader != SHADER_NONE) {
 			Ray volume_ray = ray;
 			volume_ray.t = (hit)? isect.t: FLT_MAX;
+			
+			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
 
-#ifdef __KERNEL_CPU__
+#ifdef __VOLUME_DECOUPLED__
 			/* decoupled ray marching only supported on CPU */
-			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
 
 			/* cache steps along volume for repeated sampling */
 			VolumeSegment volume_segment;
@@ -1098,29 +928,45 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			kernel_volume_decoupled_record(kg, &state,
 				&volume_ray, &volume_sd, &volume_segment, heterogeneous);
 
-			/* sample scattering */
-			int num_samples = kernel_data.integrator.volume_samples;
-			float num_samples_inv = 1.0f/num_samples;
+			/* direct light sampling */
+			if(volume_segment.closure_flag & SD_SCATTER) {
+				volume_segment.sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
 
-			for(int j = 0; j < num_samples; j++) {
-				/* workaround to fix correlation bug in T38710, can find better solution
-				 * in random number generator later, for now this is done here to not impact
-				 * performance of rendering without volumes */
-				RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
+				bool all = kernel_data.integrator.sample_all_lights_direct;
 
-				PathState ps = state;
-				Ray pray = ray;
-				float3 tp = throughput;
+				kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
+					throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
 
-				/* branch RNG state */
-				path_state_branch(&ps, j, num_samples);
+				/* indirect light sampling */
+				int num_samples = kernel_data.integrator.volume_samples;
+				float num_samples_inv = 1.0f/num_samples;
 
-				VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
-					&ps, &volume_ray, &volume_sd, &tp, &tmp_rng, &volume_segment);
-				
-				if(result == VOLUME_PATH_SCATTERED) {
-					/* todo: use all-light sampling */
-					if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) {
+				for(int j = 0; j < num_samples; j++) {
+					/* workaround to fix correlation bug in T38710, can find better solution
+					 * in random number generator later, for now this is done here to not impact
+					 * performance of rendering without volumes */
+					RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
+
+					PathState ps = state;
+					Ray pray = ray;
+					float3 tp = throughput;
+
+					/* branch RNG state */
+					path_state_branch(&ps, j, num_samples);
+
+					/* scatter sample. if we use distance sampling and take just one
+					 * sample for direct and indirect light, we could share this
+					 * computation, but that would make the code more complex */
+					float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE);
+					float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE);
+
+					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+						&ps, &pray, &volume_sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
+						
+					(void)result;
+					kernel_assert(result == VOLUME_PATH_SCATTERED);
+
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
 						kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
 
 						/* for render passes, sum and reset indirect light pass variables
@@ -1150,18 +996,22 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 				PathState ps = state;
 				Ray pray = ray;
 				ShaderData volume_sd;
-				float3 tp = throughput;
+				float3 tp = throughput * num_samples_inv;
 
 				/* branch RNG state */
 				path_state_branch(&ps, j, num_samples);
 
-				VolumeIntegrateResult result = kernel_volume_integrate(kg, &ps,
-					&volume_sd, &volume_ray, &L, &tp, rng);
+				VolumeIntegrateResult result = kernel_volume_integrate(
+					kg, &ps, &volume_sd, &volume_ray, &L, &tp, rng, heterogeneous);
 				
+#ifdef __VOLUME_SCATTER__
 				if(result == VOLUME_PATH_SCATTERED) {
-					/* todo: use all-light sampling */
-					if(kernel_path_integrate_scatter_lighting(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) {
-						kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
+					/* todo: support equiangular, MIS and all light sampling.
+					 * alternatively get decoupled ray marching working on the GPU */
+					kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L);
+
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
+						kernel_path_indirect(kg, rng, pray, tp, num_samples, ps, &L);
 
 						/* for render passes, sum and reset indirect light pass variables
 						 * for the next samples */
@@ -1169,6 +1019,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 						path_radiance_reset_indirect(&L);
 					}
 				}
+#endif
 			}
 
 			/* todo: avoid this calculation using decoupled ray marching */
@@ -1205,7 +1056,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 
 		/* holdout */
 #ifdef __HOLDOUT__
-		if((sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK))) {
+		if(sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK)) {
 			if(kernel_data.background.transparent) {
 				float3 holdout_weight;
 				
@@ -1245,7 +1096,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 				break;
 			}
 			else if(probability != 1.0f) {
-				float terminate = path_state_rng_1D(kg, rng, &state, PRNG_TERMINATE);
+				float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
 
 				if(terminate >= probability)
 					break;
@@ -1257,90 +1108,33 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
-			int num_samples = kernel_data.integrator.ao_samples;
-			float num_samples_inv = 1.0f/num_samples;
-			float ao_factor = kernel_data.background.ao_factor;
-			float3 ao_N;
-			float3 ao_bsdf = shader_bsdf_ao(kg, &sd, ao_factor, &ao_N);
-			float3 ao_alpha = shader_bsdf_alpha(kg, &sd);
-
-			for(int j = 0; j < num_samples; j++) {
-				float bsdf_u, bsdf_v;
-				path_branched_rng_2D(kg, rng, &state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
-				float3 ao_D;
-				float ao_pdf;
-
-				sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
-				if(dot(sd.Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
-					Ray light_ray;
-					float3 ao_shadow;
-
-					light_ray.P = ray_offset(sd.P, sd.Ng);
-					light_ray.D = ao_D;
-					light_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
-					light_ray.time = sd.time;
-#endif
-					light_ray.dP = sd.dP;
-					light_ray.dD = differential3_zero();
-
-					if(!shadow_blocked(kg, &state, &light_ray, &ao_shadow))
-						path_radiance_accum_ao(&L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state.bounce);
-				}
-			}
+			kernel_branched_path_ao(kg, &sd, &L, &state, rng, throughput);
 		}
 #endif
 
 #ifdef __SUBSURFACE__
 		/* bssrdf scatter to a different location on the same object */
 		if(sd.flag & SD_BSSRDF) {
-			for(int i = 0; i< sd.num_closure; i++) {
-				ShaderClosure *sc = &sd.closure[i];
-
-				if(!CLOSURE_IS_BSSRDF(sc->type))
-					continue;
-
-				/* set up random number generator */
-				uint lcg_state = lcg_state_init(rng, &state, 0x68bc21eb);
-				int num_samples = kernel_data.integrator.subsurface_samples;
-				float num_samples_inv = 1.0f/num_samples;
-				RNG bssrdf_rng = cmj_hash(*rng, i);
-
-				state.flag |= PATH_RAY_BSSRDF_ANCESTOR;
-
-				/* do subsurface scatter step with copy of shader data, this will
-				 * replace the BSSRDF with a diffuse BSDF closure */
-				for(int j = 0; j < num_samples; j++) {
-					ShaderData bssrdf_sd[BSSRDF_MAX_HITS];
-					float bssrdf_u, bssrdf_v;
-					path_branched_rng_2D(kg, &bssrdf_rng, &state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-					int num_hits = subsurface_scatter_multi_step(kg, &sd, bssrdf_sd, state.flag, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
-
-					/* compute lighting with the BSDF closure */
-					for(int hit = 0; hit < num_hits; hit++) {
-						PathState hit_state = state;
-
-						path_state_branch(&hit_state, j, num_samples);
-
-						kernel_branched_path_integrate_lighting(kg, rng,
-						                                        &bssrdf_sd[hit], throughput, num_samples_inv,
-						                                        &hit_state, &L, buffer);
-					}
-				}
-
-				state.flag &= ~PATH_RAY_BSSRDF_ANCESTOR;
-			}
+			kernel_branched_path_subsurface_scatter(kg, &sd, &L, &state,
+			                                        rng, &ray, throughput);
 		}
 #endif
 
 		if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
 			PathState hit_state = state;
 
-			/* lighting */
-			kernel_branched_path_integrate_lighting(kg, rng,
-				&sd, throughput, 1.0f, &hit_state, &L, buffer);
+#ifdef __EMISSION__
+			/* direct light */
+			if(kernel_data.integrator.use_direct_light) {
+				bool all = kernel_data.integrator.sample_all_lights_direct;
+				kernel_branched_path_surface_connect_light(kg, rng,
+					&sd, &hit_state, throughput, 1.0f, &L, all);
+			}
+#endif
+
+			/* indirect light */
+			kernel_branched_path_surface_indirect_light(kg, rng,
+				&sd, throughput, 1.0f, &hit_state, &L);
 
 			/* continue in case of transparency */
 			throughput *= shader_bsdf_transparency(kg, &sd);
@@ -1353,6 +1147,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		ray.P = ray_offset(sd.P, -sd.Ng);
 		ray.t -= sd.ray_length; /* clipping works through transparent */
 
+
+#ifdef __RAY_DIFFERENTIALS__
+		ray.dP = sd.dP;
+		ray.dD.dx = -sd.dI.dx;
+		ray.dD.dy = -sd.dI.dy;
+#endif
+
 #ifdef __VOLUME__
 		/* enter/exit volume */
 		kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
@@ -1363,6 +1164,10 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 
 	kernel_write_light_passes(kg, buffer, &L, sample);
 
+#ifdef __KERNEL_DEBUG__
+	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
+#endif
+
 	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
 }
 
@@ -1372,11 +1177,8 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, ccl_global uin
 {
 	float filter_u;
 	float filter_v;
-#ifdef __CMJ__
+
 	int num_samples = kernel_data.integrator.aa_samples;
-#else
-	int num_samples = 0;
-#endif
 
 	path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v);
 
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index 406654c1741..f29168642a4 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -16,17 +16,13 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG *rng, int sample)
+ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG *rng, int sample, Ray *ray)
 {
-	state->flag = PATH_RAY_CAMERA|PATH_RAY_SINGULAR|PATH_RAY_MIS_SKIP;
+	state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP;
 
 	state->rng_offset = PRNG_BASE_NUM;
 	state->sample = sample;
-#ifdef __CMJ__
 	state->num_samples = kernel_data.integrator.aa_samples;
-#else
-	state->num_samples = 0;
-#endif
 
 	state->bounce = 0;
 	state->diffuse_bounce = 0;
@@ -45,7 +41,7 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG
 
 	if(kernel_data.integrator.use_volumes) {
 		/* initialize volume stack with volume we are inside of */
-		kernel_volume_stack_init(kg, state->volume_stack);
+		kernel_volume_stack_init(kg, ray, state->volume_stack);
 		/* seed RNG for cases where we can't use stratified samples */
 		state->rng_congruential = lcg_init(*rng + sample*0x51633e2d);
 	}
@@ -63,8 +59,8 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, PathState *state, int
 		state->flag |= PATH_RAY_TRANSPARENT;
 		state->transparent_bounce++;
 
-		/* random number generator next bounce */
-		state->rng_offset += PRNG_BOUNCE_NUM;
+		/* don't increase random number generator offset here, to avoid some
+		 * unwanted patterns, see path_state_rng_1D_for_decision */
 
 		if(!kernel_data.integrator.transparent_shadows)
 			state->flag |= PATH_RAY_MIS_SKIP;
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
new file mode 100644
index 00000000000..9553c2da0df
--- /dev/null
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -0,0 +1,299 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__)
+
+/* branched path tracing: connect path directly to position on one or more lights and add it to L; sample_all_lights selects between sampling every lamp plus the mesh lights and sampling one random light */
+ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
+	ShaderData *sd, PathState *state, float3 throughput, float num_samples_adjust, PathRadiance *L, bool sample_all_lights)
+{
+#ifdef __EMISSION__
+	/* sample illumination from lights to find path contribution, skipped when sd lacks SD_BSDF_HAS_EVAL */
+	if(!(sd->flag & SD_BSDF_HAS_EVAL))
+		return;
+
+	Ray light_ray;
+	BsdfEval L_light;
+	bool is_lamp;
+
+#ifdef __OBJECT_MOTION__
+	light_ray.time = sd->time;
+#endif
+
+	if(sample_all_lights) {
+		/* lamp sampling: every lamp gets its own sample count, scaled by num_samples_adjust */
+		for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
+			int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
+			float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
+			RNG lamp_rng = cmj_hash(*rng, i); /* separate RNG per lamp, hashed from *rng and the lamp index */
+
+			if(kernel_data.integrator.pdf_triangles != 0.0f)
+				num_samples_inv *= 0.5f; /* mesh lights are sampled below too, which applies the same 0.5 factor */
+
+			for(int j = 0; j < num_samples; j++) {
+				float light_u, light_v;
+				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+
+				LightSample ls;
+				lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls);
+
+				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+					/* trace shadow ray */
+					float3 shadow;
+
+					if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+						/* accumulate, weighted by num_samples_inv */
+						path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+					}
+				}
+			}
+		}
+
+		/* mesh light sampling, only when pdf_triangles is non-zero */
+		if(kernel_data.integrator.pdf_triangles != 0.0f) {
+			int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples);
+			float num_samples_inv = num_samples_adjust/num_samples;
+
+			if(kernel_data.integrator.num_all_lights)
+				num_samples_inv *= 0.5f; /* lamps were sampled above with the same 0.5 factor */
+
+			for(int j = 0; j < num_samples; j++) {
+				float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT);
+				float light_u, light_v;
+				path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+
+				/* only sample triangle lights: halve light_t when lamps exist, presumably restricting it to the triangle range of light_sample (TODO confirm) */
+				if(kernel_data.integrator.num_all_lights)
+					light_t = 0.5f*light_t;
+
+				LightSample ls;
+				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+
+				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+					/* trace shadow ray */
+					float3 shadow;
+
+					if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+						/* accumulate, weighted by num_samples_inv */
+						path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+					}
+				}
+			}
+		}
+	}
+	else {
+		/* sample one light at random, using the non-branched path state random numbers */
+		float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+		float light_u, light_v;
+		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+		LightSample ls;
+		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+
+		/* sample random light */
+		if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+			/* trace shadow ray */
+			float3 shadow;
+
+			if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+				/* accumulate, weighted by num_samples_adjust */
+				path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
+			}
+		}
+	}
+#endif
+}
+
+/* branched path tracing: bounce off or through surface with new direction stored in ray; returns false if the sampled closure has zero pdf or zero eval */
+ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
+	ShaderData *sd, const ShaderClosure *sc, int sample, int num_samples,
+	float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
+{
+	/* sample BSDF closure sc, random numbers are drawn for branch sample out of num_samples */
+	float bsdf_pdf;
+	BsdfEval bsdf_eval;
+	float3 bsdf_omega_in;
+	differential3 bsdf_domega_in;
+	float bsdf_u, bsdf_v;
+	path_branched_rng_2D(kg, rng, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+	int label;
+
+	label = shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval,
+		&bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
+
+	if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
+		return false;
+
+	/* modify throughput */
+	path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
+
+	/* modify path state */
+	path_state_next(kg, state, label);
+
+	/* setup ray: origin is sd->P passed through ray_offset along Ng, or -Ng for transmission labels */
+	ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
+	ray->D = bsdf_omega_in;
+	ray->t = FLT_MAX;
+#ifdef __RAY_DIFFERENTIALS__
+	ray->dP = sd->dP;
+	ray->dD = bsdf_domega_in;
+#endif
+#ifdef __OBJECT_MOTION__
+	ray->time = sd->time;
+#endif
+
+#ifdef __VOLUME__
+	/* enter/exit volume, only for transmission labels */
+	if(label & LABEL_TRANSMIT)
+		kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
+#endif
+
+	/* branch RNG state */
+	path_state_branch(state, sample, num_samples);
+
+	/* set MIS state; min_ray_pdf is reset from this bounce's pdf (capped at FLT_MAX) instead of being combined with the previous value */
+	state->min_ray_pdf = fminf(bsdf_pdf, FLT_MAX);
+	state->ray_pdf = bsdf_pdf;
+#ifdef __LAMP_MIS__
+	state->ray_t = 0.0f;
+#endif
+
+	return true;
+}
+
+#endif
+
+/* path tracing: connect path directly to position on a light and add it to L; does nothing unless use_direct_light is set and sd has SD_BSDF_HAS_EVAL */
+ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
+	ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L)
+{
+#ifdef __EMISSION__
+	if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)))
+		return;
+
+	/* sample illumination from lights to find path contribution: one light sample from the PRNG_LIGHT and PRNG_LIGHT_U dimensions */
+	float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+	float light_u, light_v;
+	path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+	Ray light_ray;
+	BsdfEval L_light;
+	bool is_lamp;
+
+#ifdef __OBJECT_MOTION__
+	light_ray.time = sd->time;
+#endif
+
+	LightSample ls;
+	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+
+	if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+		/* trace shadow ray, shadow_blocked writes its result into shadow */
+		float3 shadow;
+
+		if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+			/* accumulate with the unscaled throughput */
+			path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+		}
+	}
+#endif
+}
+
+/* path tracing: bounce off or through surface with new direction stored in ray; returns false when no bounce was set up (zero pdf/eval, or neither BSDF nor volume-only shader) */
+ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng,
+	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
+{
+	/* surface has a BSDF: sample it here, the volume-only and no-shader cases are handled in the branches below */
+	if(sd->flag & SD_BSDF) {
+		/* sample BSDF */
+		float bsdf_pdf;
+		BsdfEval bsdf_eval;
+		float3 bsdf_omega_in;
+		differential3 bsdf_domega_in;
+		float bsdf_u, bsdf_v;
+		path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+		int label;
+
+		label = shader_bsdf_sample(kg, sd, bsdf_u, bsdf_v, &bsdf_eval,
+			&bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
+
+		if(bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
+			return false;
+
+		/* modify throughput */
+		path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
+
+		/* set MIS state, skipped for transparent labels so the previous ray_pdf/min_ray_pdf are kept */
+		if(!(label & LABEL_TRANSPARENT)) {
+			state->ray_pdf = bsdf_pdf;
+#ifdef __LAMP_MIS__
+			state->ray_t = 0.0f;
+#endif
+			state->min_ray_pdf = fminf(bsdf_pdf, state->min_ray_pdf);
+		}
+
+		/* update path state */
+		path_state_next(kg, state, label);
+
+		/* setup ray: origin is sd->P passed through ray_offset along Ng, or -Ng for transmission labels */
+		ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
+		ray->D = bsdf_omega_in;
+
+		if(state->bounce == 0)
+			ray->t -= sd->ray_length; /* clipping works through transparent: still at bounce 0, keep only the remaining ray length */
+		else
+			ray->t = FLT_MAX;
+
+#ifdef __RAY_DIFFERENTIALS__
+		ray->dP = sd->dP;
+		ray->dD = bsdf_domega_in;
+#endif
+
+#ifdef __VOLUME__
+		/* enter/exit volume, only for transmission labels */
+		if(label & LABEL_TRANSMIT)
+			kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
+#endif
+		return true;
+	}
+#ifdef __VOLUME__
+	else if(sd->flag & SD_HAS_ONLY_VOLUME) {
+		/* no surface shader but have a volume shader? act transparent */
+
+		/* update path state, count as transparent */
+		path_state_next(kg, state, LABEL_TRANSPARENT);
+
+		/* setup ray position on the -Ng side, direction stays unchanged */
+		ray->P = ray_offset(sd->P, -sd->Ng);
+#ifdef __RAY_DIFFERENTIALS__
+		ray->dP = sd->dP;
+#endif
+
+		/* enter/exit volume, unconditionally in this branch */
+		kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
+		return true;
+	}
+#endif
+	else {
+		/* no bsdf or volume? nothing to bounce off, report failure */
+		return false;
+	}
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
new file mode 100644
index 00000000000..d8143832294
--- /dev/null
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -0,0 +1,267 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __VOLUME_SCATTER__
+
+ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
+	ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L)
+{ /* volume path tracing: connect the point in sd directly to one sampled light and add it to L; does nothing unless use_direct_light is set */
+#ifdef __EMISSION__
+	if(!kernel_data.integrator.use_direct_light)
+		return;
+
+	/* sample illumination from lights to find path contribution: one light sample from the PRNG_LIGHT and PRNG_LIGHT_U dimensions */
+	float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+	float light_u, light_v;
+	path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+	Ray light_ray;
+	BsdfEval L_light;
+	LightSample ls;
+	bool is_lamp;
+
+	/* connect to light from given point where shader has been evaluated */
+#ifdef __OBJECT_MOTION__
+	light_ray.time = sd->time;
+#endif
+
+	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+	if(ls.pdf == 0.0f)
+		return; /* skip light samples with zero pdf */
+	
+	if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+		/* trace shadow ray, shadow_blocked writes its result into shadow */
+		float3 shadow;
+
+		if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+			/* accumulate with the unscaled throughput */
+			path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+		}
+	}
+#endif
+}
+
+#ifdef __KERNEL_GPU__ /* GPU builds use the noinline device qualifier for this function */
+ccl_device_noinline
+#else
+ccl_device
+#endif
+bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
+	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
+{ /* sample a new direction from the volume phase function and store it in ray; returns false if the sample has zero pdf or zero eval */
+	/* sample phase function, random numbers come from the PRNG_PHASE_U dimension */
+	float phase_pdf;
+	BsdfEval phase_eval;
+	float3 phase_omega_in;
+	differential3 phase_domega_in;
+	float phase_u, phase_v;
+	path_state_rng_2D(kg, rng, state, PRNG_PHASE_U, &phase_u, &phase_v);
+	int label;
+
+	label = shader_volume_phase_sample(kg, sd, phase_u, phase_v, &phase_eval,
+		&phase_omega_in, &phase_domega_in, &phase_pdf);
+
+	if(phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval))
+		return false;
+	
+	/* modify throughput */
+	path_radiance_bsdf_bounce(L, throughput, &phase_eval, phase_pdf, state->bounce, label);
+
+	/* set MIS state (ray_pdf, ray_t, min_ray_pdf) */
+	state->ray_pdf = phase_pdf;
+#ifdef __LAMP_MIS__
+	state->ray_t = 0.0f;
+#endif
+	state->min_ray_pdf = fminf(phase_pdf, state->min_ray_pdf);
+
+	/* update path state */
+	path_state_next(kg, state, label);
+
+	/* setup ray: origin is sd->P itself (no ray_offset), length is unbounded */
+	ray->P = sd->P;
+	ray->D = phase_omega_in;
+	ray->t = FLT_MAX;
+
+#ifdef __RAY_DIFFERENTIALS__
+	ray->dP = sd->dP;
+	ray->dD = phase_domega_in;
+#endif
+
+	return true;
+}
+
+ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
+	ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L,
+	float num_samples_adjust, bool sample_all_lights, Ray *ray, const VolumeSegment *segment)
+{ /* branched path tracing: for each light sample pick a scatter position on the volume segment, connect it to the light and add it to L */
+#ifdef __EMISSION__
+	if(!kernel_data.integrator.use_direct_light)
+		return;
+
+	Ray light_ray;
+	BsdfEval L_light;
+	bool is_lamp;
+
+#ifdef __OBJECT_MOTION__
+	light_ray.time = sd->time;
+#endif
+
+	if(sample_all_lights) {
+		/* lamp sampling: every lamp gets its own sample count, scaled by num_samples_adjust */
+		for(int i = 0; i < kernel_data.integrator.num_all_lights; i++) {
+			int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
+			float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
+			RNG lamp_rng = cmj_hash(*rng, i); /* separate RNG per lamp, hashed from *rng and the lamp index */
+
+			if(kernel_data.integrator.pdf_triangles != 0.0f)
+				num_samples_inv *= 0.5f; /* mesh lights are sampled below too, which applies the same 0.5 factor */
+
+			for(int j = 0; j < num_samples; j++) {
+				/* sample random position on given light, first evaluated from the ray origin ray->P */
+				float light_u, light_v;
+				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+
+				LightSample ls;
+				lamp_light_sample(kg, i, light_u, light_v, ray->P, &ls);		
+
+				float3 tp = throughput;
+
+				/* sample position on volume segment; the light position is passed along only when ls.t is not FLT_MAX */
+				float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
+				float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
+
+				VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+					state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
+					
+				(void)result; /* result is only checked by the assert below */
+				kernel_assert(result == VOLUME_PATH_SCATTERED);
+
+				/* todo: split up light_sample so we don't have to call it again with new position */
+				lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls);
+
+				if(ls.pdf == 0.0f)
+					continue; /* skip light samples with zero pdf */
+
+				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+					/* trace shadow ray */
+					float3 shadow;
+
+					if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+						/* accumulate, using tp as modified by the scatter call and weighted by num_samples_inv */
+						path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+					}
+				}
+			}
+		}
+
+		/* mesh light sampling, only when pdf_triangles is non-zero */
+		if(kernel_data.integrator.pdf_triangles != 0.0f) {
+			int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples);
+			float num_samples_inv = num_samples_adjust/num_samples;
+
+			if(kernel_data.integrator.num_all_lights)
+				num_samples_inv *= 0.5f; /* lamps were sampled above with the same 0.5 factor */
+
+			for(int j = 0; j < num_samples; j++) {
+				/* sample random position on random triangle */
+				float light_t = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_LIGHT);
+				float light_u, light_v;
+				path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+
+				/* only sample triangle lights: halve light_t when lamps exist, presumably restricting it to the triangle range of light_sample (TODO confirm) */
+				if(kernel_data.integrator.num_all_lights)
+					light_t = 0.5f*light_t;
+
+				LightSample ls;
+				light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls);
+
+				float3 tp = throughput;
+
+				/* sample position on volume segment; the light position is passed along only when ls.t is not FLT_MAX */
+				float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
+				float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
+
+				VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+					state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
+					
+				(void)result; /* result is only checked by the assert below */
+				kernel_assert(result == VOLUME_PATH_SCATTERED);
+
+				/* todo: split up light_sample so we don't have to call it again with new position */
+				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+
+				if(ls.pdf == 0.0f)
+					continue; /* skip light samples with zero pdf */
+
+				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+					/* trace shadow ray */
+					float3 shadow;
+
+					if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+						/* accumulate, using tp as modified by the scatter call and weighted by num_samples_inv */
+						path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+					}
+				}
+			}
+		}
+	}
+	else {
+		/* sample random position on random light, first evaluated from the ray origin ray->P */
+		float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+		float light_u, light_v;
+		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+		LightSample ls;
+		light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls);
+
+		float3 tp = throughput;
+
+		/* sample position on volume segment; the light position is passed along only when ls.t is not FLT_MAX */
+		float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
+		float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+
+		VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+			state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
+			
+		(void)result; /* result is only checked by the assert below */
+		kernel_assert(result == VOLUME_PATH_SCATTERED);
+
+		/* todo: split up light_sample so we don't have to call it again with new position */
+		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+
+		if(ls.pdf == 0.0f)
+			return; /* skip light samples with zero pdf */
+
+		/* sample random light */
+		if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
+			/* trace shadow ray */
+			float3 shadow;
+
+			if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+				/* accumulate; NOTE(review): num_samples_adjust is not applied in this branch, unlike kernel_branched_path_surface_connect_light - confirm this is intended */
+				path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+			}
+		}
+	}
+#endif
+}
+
+#endif
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index 31cb6ff6abd..236f74c0a82 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -261,22 +261,41 @@ ccl_device uint lcg_init(uint seed)
  * For branches in the path we must be careful not to reuse the same number
  * in a sequence and offset accordingly. */
 
-ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension)
 {
 	return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension);
 }
 
-ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, PathState *state, int dimension, float *fx, float *fy)
+ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension)
+{
+	/* the rng_offset is not increased for transparent bounces. if it were,
+	 * fully transparent objects could become subtly visible through the different
+	 * sampling patterns used where the transparent object is.
+	 *
+	 * however, for random numbers that determine whether the next bounce
+	 * is transparent we do need to increase the offset to avoid always making
+	 * the same decision, so transparent_bounce*PRNG_BOUNCE_NUM is added here */
+	int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
+	return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension);
+}
+
+ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension, float *fx, float *fy)
 {
 	path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy);
 }
 
-ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
 {
 	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension);
 }
 
-ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
+ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+{ /* branched 1D random number whose dimension offset also advances by PRNG_BOUNCE_NUM per transparent bounce */
+	int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
+	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension); /* sample index and count are expanded by num_branches */
+}
+
+ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
 {
 	path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy);
 }
@@ -290,7 +309,7 @@ ccl_device_inline void path_state_branch(PathState *state, int branch, int num_b
 	state->num_samples = state->num_samples*num_branches;
 }
 
-ccl_device_inline uint lcg_state_init(RNG *rng, PathState *state, uint scramble)
+ccl_device_inline uint lcg_state_init(RNG *rng, const PathState *state, uint scramble)
 {
 	return lcg_init(*rng + state->rng_offset + state->sample*scramble);
 }
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 58cec090410..db08c328d7e 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -86,9 +86,8 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
 #endif
 	if(sd->type & PRIMITIVE_TRIANGLE) {
 		/* static triangle */
-		float4 Ns = kernel_tex_fetch(__tri_normal, sd->prim);
-		float3 Ng = make_float3(Ns.x, Ns.y, Ns.z);
-		sd->shader = __float_as_int(Ns.w);
+		float3 Ng = triangle_normal(kg, sd);
+		sd->shader =  kernel_tex_fetch(__tri_shader, sd->prim);
 
 		/* vectors */
 		sd->P = triangle_refine(kg, sd, isect, ray);
@@ -166,9 +165,8 @@ ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderDat
 
 	/* fetch triangle data */
 	if(sd->type == PRIMITIVE_TRIANGLE) {
-		float4 Ns = kernel_tex_fetch(__tri_normal, sd->prim);
-		float3 Ng = make_float3(Ns.x, Ns.y, Ns.z);
-		sd->shader = __float_as_int(Ns.w);
+		float3 Ng = triangle_normal(kg, sd);
+		sd->shader =  kernel_tex_fetch(__tri_shader, sd->prim);
 
 		/* static triangle */
 		sd->P = triangle_refine_subsurface(kg, sd, isect, ray);
@@ -342,7 +340,7 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
 	float3 P, Ng, I = make_float3(0.0f, 0.0f, 0.0f);
 	int shader;
 
-	triangle_point_normal(kg, prim, u, v, &P, &Ng, &shader);
+	triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
 
 	/* force smooth shading for displacement */
 	shader |= SHADER_SMOOTH_NORMAL;
@@ -609,6 +607,9 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn
 
 ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
 {
+	if(sd->flag & SD_HAS_ONLY_VOLUME)
+		return make_float3(1.0f, 1.0f, 1.0f);
+
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
 	for(int i = 0; i< sd->num_closure; i++) {
@@ -797,8 +798,8 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
 #ifdef __SVM__
 		svm_eval_nodes(kg, sd, SHADER_TYPE_SURFACE, path_flag);
 #else
-		sd->closure.weight = make_float3(0.8f, 0.8f, 0.8f);
-		sd->closure.N = sd->N;
+		sd->closure->weight = make_float3(0.8f, 0.8f, 0.8f);
+		sd->closure->N = sd->N;
 		sd->flag |= bsdf_diffuse_setup(&sd->closure);
 #endif
 	}
@@ -857,7 +858,7 @@ ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, con
 
 			if(phase_pdf != 0.0f) {
 				bsdf_eval_accum(result_eval, sc->type, eval);
-				sum_pdf += phase_pdf;
+				sum_pdf += phase_pdf*sc->sample_weight;
 			}
 
 			sum_sample_weight += sc->sample_weight;
@@ -1025,8 +1026,7 @@ ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect
 #ifdef __HAIR__
 	if(kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE) {
 #endif
-		float4 Ns = kernel_tex_fetch(__tri_normal, prim);
-		shader = __float_as_int(Ns.w);
+		shader = kernel_tex_fetch(__tri_shader, prim);
 #ifdef __HAIR__
 	}
 	else {
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index ab7524c411a..61954282c28 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -64,18 +64,21 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 	bool blocked;
 
 	if(kernel_data.integrator.transparent_shadows) {
+		/* check transparent bounces here, for volume scatter which can do
+		 * lighting before surface path termination is checked */
+		if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce)
+			return true;
+
 		/* intersect to find an opaque surface, or record all transparent surface hits */
 		Intersection hits_stack[STACK_MAX_HITS];
-		Intersection *hits;
+		Intersection *hits = hits_stack;
 		uint max_hits = kernel_data.integrator.transparent_max_bounce - state->transparent_bounce - 1;
 
 		/* prefer to use stack but use dynamic allocation if too deep max hits
 		 * we need max_hits + 1 storage space due to the logic in
 		 * scene_intersect_shadow_all which will first store and then check if
 		 * the limit is exceeded */
-		if(max_hits + 1 <= STACK_MAX_HITS)
-			hits = hits_stack;
-		else
+		if(max_hits + 1 > STACK_MAX_HITS)
 			hits = (Intersection*)malloc(sizeof(Intersection)*(max_hits + 1));
 
 		uint num_hits;
@@ -152,7 +155,11 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 				kernel_volume_shadow(kg, &ps, ray, &throughput);
 #endif
 
-			*shadow *= throughput;
+			*shadow = throughput;
+
+			if(hits != hits_stack)
+				free(hits);
+			return is_zero(throughput);
 		}
 
 		/* free dynamic storage */
@@ -161,11 +168,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 	}
 	else {
 		Intersection isect;
-#ifdef __HAIR__
 		blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
-#else
-		blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect);
-#endif
 	}
 
 #ifdef __VOLUME__
@@ -178,6 +181,8 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 	return blocked;
 }
 
+#undef STACK_MAX_HITS
+
 #else
 
 /* Shadow function to compute how much light is blocked, GPU variation.
@@ -196,11 +201,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 		return false;
 
 	Intersection isect;
-#ifdef __HAIR__
 	bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
-#else
-	bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect);
-#endif
 
 #ifdef __TRANSPARENT_SHADOWS__
 	if(blocked && kernel_data.integrator.transparent_shadows) {
@@ -216,11 +217,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
 				if(bounce >= kernel_data.integrator.transparent_max_bounce)
 					return true;
 
-#ifdef __HAIR__
 				if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect, NULL, 0.0f, 0.0f))
-#else
-				if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect))
-#endif
 				{
 
 #ifdef __VOLUME__
diff --git a/intern/cycles/kernel/kernel_sse2.cpp b/intern/cycles/kernel/kernel_sse2.cpp
index 2d5f6091908..740998e8c92 100644
--- a/intern/cycles/kernel/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernel_sse2.cpp
@@ -34,7 +34,7 @@
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
-#include "kernel_displace.h"
+#include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -64,9 +64,12 @@ void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
 
 /* Shader Evaluate */
 
-void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
 {
-	kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+	if(type >= SHADER_EVAL_BAKE)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+	else
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp
index 1062fd0c990..da73a3a1c97 100644
--- a/intern/cycles/kernel/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernel_sse3.cpp
@@ -36,7 +36,7 @@
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
-#include "kernel_displace.h"
+#include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -66,9 +66,12 @@ void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
 
 /* Shader Evaluate */
 
-void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
 {
-	kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+	if(type >= SHADER_EVAL_BAKE)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+	else
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_sse41.cpp b/intern/cycles/kernel/kernel_sse41.cpp
index ba3b4887650..5704f60e138 100644
--- a/intern/cycles/kernel/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernel_sse41.cpp
@@ -37,7 +37,7 @@
 #include "kernel_globals.h"
 #include "kernel_film.h"
 #include "kernel_path.h"
-#include "kernel_displace.h"
+#include "kernel_bake.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -67,9 +67,12 @@ void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, flo
 
 /* Shader Evaluate */
 
-void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i)
+void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
 {
-	kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i);
+	if(type >= SHADER_EVAL_BAKE)
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
+	else
+		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index b07075c6c95..ef46b2f707f 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -36,7 +36,7 @@ KERNEL_TEX(float4, texture_float4, __objects)
 KERNEL_TEX(float4, texture_float4, __objects_vector)
 
 /* triangles */
-KERNEL_TEX(float4, texture_float4, __tri_normal)
+KERNEL_TEX(uint, texture_uint, __tri_shader)
 KERNEL_TEX(float4, texture_float4, __tri_vnormal)
 KERNEL_TEX(float4, texture_float4, __tri_vindex)
 KERNEL_TEX(float4, texture_float4, __tri_verts)
@@ -49,6 +49,7 @@ KERNEL_TEX(float4, texture_float4, __curve_keys)
 KERNEL_TEX(uint4, texture_uint4, __attributes_map)
 KERNEL_TEX(float, texture_float, __attributes_float)
 KERNEL_TEX(float4, texture_float4, __attributes_float3)
+KERNEL_TEX(uchar4, texture_uchar4, __attributes_uchar4)
 
 /* lights */
 KERNEL_TEX(float4, texture_float4, __light_distribution)
@@ -172,10 +173,9 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_095)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_096)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_097)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_098)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_099)
 
 /* Kepler and above */
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_099)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_100)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_101)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_102)
@@ -227,7 +227,6 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_147)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_148)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_149)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_150)
-#endif
 
 /* packed image (opencl) */
 KERNEL_TEX(uchar4, texture_uchar4, __tex_image_packed)
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 11445aa1c93..cfac8d1e905 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -38,12 +38,14 @@ CCL_NAMESPACE_BEGIN
 #define BSSRDF_MIN_RADIUS			1e-8f
 #define BSSRDF_MAX_HITS				4
 
-#define BB_DRAPPER				800.0f
+#define BB_DRAPER				800.0f
 #define BB_MAX_TABLE_RANGE		12000.0f
 #define BB_TABLE_XPOWER			1.5f
 #define BB_TABLE_YPOWER			5.0f
 #define BB_TABLE_SPACING		2.0f
 
+#define BECKMANN_TABLE_SIZE		256
+
 #define TEX_NUM_FLOAT_IMAGES	5
 
 #define SHADER_NONE				(~0)
@@ -64,6 +66,8 @@ CCL_NAMESPACE_BEGIN
 #define __SUBSURFACE__
 #define __CMJ__
 #define __VOLUME__
+#define __VOLUME_DECOUPLED__
+#define __VOLUME_SCATTER__
 #define __SHADOW_RECORD_ALL__
 #endif
 
@@ -71,10 +75,15 @@ CCL_NAMESPACE_BEGIN
 #define __KERNEL_SHADING__
 #define __KERNEL_ADV_SHADING__
 #define __BRANCHED_PATH__
+#define __VOLUME__
+#define __VOLUME_SCATTER__
 
 /* Experimental on GPU */
-//#define __VOLUME__
-//#define __SUBSURFACE__
+#ifdef __KERNEL_CUDA_EXPERIMENTAL__
+#define __SUBSURFACE__
+#define __CMJ__
+#endif
+
 #endif
 
 #ifdef __KERNEL_OPENCL__
@@ -101,7 +110,6 @@ CCL_NAMESPACE_BEGIN
 #define __BACKGROUND_MIS__
 #define __LAMP_MIS__
 #define __AO__
-#define __ANISOTROPIC__
 //#define __CAMERA_MOTION__
 //#define __OBJECT_MOTION__
 //#define __HAIR__
@@ -132,11 +140,9 @@ CCL_NAMESPACE_BEGIN
 #ifdef __KERNEL_SHADING__
 #define __SVM__
 #define __EMISSION__
-#define __PROCEDURAL_TEXTURES__
-#define __IMAGE_TEXTURES__
+#define __TEXTURES__
 #define __EXTRA_NODES__
 #define __HOLDOUT__
-#define __NORMAL_MAP__
 #endif
 
 #ifdef __KERNEL_ADV_SHADING__
@@ -146,12 +152,15 @@ CCL_NAMESPACE_BEGIN
 #define __BACKGROUND_MIS__
 #define __LAMP_MIS__
 #define __AO__
-#define __ANISOTROPIC__
 #define __CAMERA_MOTION__
 #define __OBJECT_MOTION__
 #define __HAIR__
 #endif
 
+#ifdef WITH_CYCLES_DEBUG
+#  define __KERNEL_DEBUG__
+#endif
+
 /* Random Numbers */
 
 typedef uint RNG;
@@ -221,10 +230,9 @@ enum PathTraceDimension {
 	PRNG_PHASE_V = 9,
 	PRNG_PHASE = 10,
 	PRNG_SCATTER_DISTANCE = 11,
-	PRNG_BOUNCE_NUM = 12,
-#else
-	PRNG_BOUNCE_NUM = 8,
 #endif
+
+	PRNG_BOUNCE_NUM = 12,
 };
 
 enum SamplingPattern {
@@ -250,17 +258,17 @@ enum PathRayFlag {
 	PATH_RAY_SHADOW_TRANSPARENT = 256,
 	PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
 
-	PATH_RAY_CURVE = 512, /* visibility flag to define curve segments*/
+	PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */
+	PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */
 
 	/* note that these can use maximum 12 bits, the other are for layers */
-	PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512),
+	PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024),
 
-	PATH_RAY_MIS_SKIP = 1024,
-	PATH_RAY_DIFFUSE_ANCESTOR = 2048,
-	PATH_RAY_GLOSSY_ANCESTOR = 4096,
-	PATH_RAY_BSSRDF_ANCESTOR = 8192,
-	PATH_RAY_SINGLE_PASS_DONE = 16384,
-	PATH_RAY_VOLUME_SCATTER = 32768,
+	PATH_RAY_MIS_SKIP = 2048,
+	PATH_RAY_DIFFUSE_ANCESTOR = 4096,
+	PATH_RAY_GLOSSY_ANCESTOR = 8192,
+	PATH_RAY_BSSRDF_ANCESTOR = 16384,
+	PATH_RAY_SINGLE_PASS_DONE = 32768,
 
 	/* we need layer member flags to be the 20 upper bits */
 	PATH_RAY_LAYER_SHIFT = (32-20)
@@ -283,32 +291,35 @@ typedef enum ClosureLabel {
 
 typedef enum PassType {
 	PASS_NONE = 0,
-	PASS_COMBINED = 1,
-	PASS_DEPTH = 2,
-	PASS_NORMAL = 4,
-	PASS_UV = 8,
-	PASS_OBJECT_ID = 16,
-	PASS_MATERIAL_ID = 32,
-	PASS_DIFFUSE_COLOR = 64,
-	PASS_GLOSSY_COLOR = 128,
-	PASS_TRANSMISSION_COLOR = 256,
-	PASS_DIFFUSE_INDIRECT = 512,
-	PASS_GLOSSY_INDIRECT = 1024,
-	PASS_TRANSMISSION_INDIRECT = 2048,
-	PASS_DIFFUSE_DIRECT = 4096,
-	PASS_GLOSSY_DIRECT = 8192,
-	PASS_TRANSMISSION_DIRECT = 16384,
-	PASS_EMISSION = 32768,
-	PASS_BACKGROUND = 65536,
-	PASS_AO = 131072,
-	PASS_SHADOW = 262144,
-	PASS_MOTION = 524288,
-	PASS_MOTION_WEIGHT = 1048576,
-	PASS_MIST = 2097152,
-	PASS_SUBSURFACE_DIRECT = 4194304,
-	PASS_SUBSURFACE_INDIRECT = 8388608,
-	PASS_SUBSURFACE_COLOR = 16777216,
-	PASS_LIGHT = 33554432, /* no real pass, used to force use_light_pass */
+	PASS_COMBINED = (1 << 0),
+	PASS_DEPTH = (1 << 1),
+	PASS_NORMAL = (1 << 2),
+	PASS_UV = (1 << 3),
+	PASS_OBJECT_ID = (1 << 4),
+	PASS_MATERIAL_ID = (1 << 5),
+	PASS_DIFFUSE_COLOR = (1 << 6),
+	PASS_GLOSSY_COLOR = (1 << 7),
+	PASS_TRANSMISSION_COLOR = (1 << 8),
+	PASS_DIFFUSE_INDIRECT = (1 << 9),
+	PASS_GLOSSY_INDIRECT = (1 << 10),
+	PASS_TRANSMISSION_INDIRECT = (1 << 11),
+	PASS_DIFFUSE_DIRECT = (1 << 12),
+	PASS_GLOSSY_DIRECT = (1 << 13),
+	PASS_TRANSMISSION_DIRECT = (1 << 14),
+	PASS_EMISSION = (1 << 15),
+	PASS_BACKGROUND = (1 << 16),
+	PASS_AO = (1 << 17),
+	PASS_SHADOW = (1 << 18),
+	PASS_MOTION = (1 << 19),
+	PASS_MOTION_WEIGHT = (1 << 20),
+	PASS_MIST = (1 << 21),
+	PASS_SUBSURFACE_DIRECT = (1 << 22),
+	PASS_SUBSURFACE_INDIRECT = (1 << 23),
+	PASS_SUBSURFACE_COLOR = (1 << 24),
+	PASS_LIGHT = (1 << 25), /* no real pass, used to force use_light_pass */
+#ifdef __KERNEL_DEBUG__
+	PASS_BVH_TRAVERSAL_STEPS = (1 << 26),
+#endif
 } PassType;
 
 #define PASS_ALL (~0)
@@ -330,21 +341,25 @@ typedef struct PathRadiance {
 	float3 color_glossy;
 	float3 color_transmission;
 	float3 color_subsurface;
+	float3 color_scatter;
 
 	float3 direct_diffuse;
 	float3 direct_glossy;
 	float3 direct_transmission;
 	float3 direct_subsurface;
+	float3 direct_scatter;
 
 	float3 indirect_diffuse;
 	float3 indirect_glossy;
 	float3 indirect_transmission;
 	float3 indirect_subsurface;
+	float3 indirect_scatter;
 
 	float3 path_diffuse;
 	float3 path_glossy;
 	float3 path_transmission;
 	float3 path_subsurface;
+	float3 path_scatter;
 
 	float4 shadow;
 	float mist;
@@ -358,6 +373,7 @@ typedef struct BsdfEval {
 	float3 transmission;
 	float3 transparent;
 	float3 subsurface;
+	float3 scatter;
 } BsdfEval;
 
 #else
@@ -378,7 +394,8 @@ typedef enum ShaderFlag {
 	SHADER_EXCLUDE_GLOSSY = (1 << 26),
 	SHADER_EXCLUDE_TRANSMIT = (1 << 25),
 	SHADER_EXCLUDE_CAMERA = (1 << 24),
-	SHADER_EXCLUDE_ANY = (SHADER_EXCLUDE_DIFFUSE|SHADER_EXCLUDE_GLOSSY|SHADER_EXCLUDE_TRANSMIT|SHADER_EXCLUDE_CAMERA),
+	SHADER_EXCLUDE_SCATTER = (1 << 23),
+	SHADER_EXCLUDE_ANY = (SHADER_EXCLUDE_DIFFUSE|SHADER_EXCLUDE_GLOSSY|SHADER_EXCLUDE_TRANSMIT|SHADER_EXCLUDE_CAMERA|SHADER_EXCLUDE_SCATTER),
 
 	SHADER_MASK = ~(SHADER_SMOOTH_NORMAL|SHADER_CAST_SHADOW|SHADER_AREA_LIGHT|SHADER_USE_MIS|SHADER_EXCLUDE_ANY)
 } ShaderFlag;
@@ -390,10 +407,8 @@ typedef enum LightType {
 	LIGHT_DISTANT,
 	LIGHT_BACKGROUND,
 	LIGHT_AREA,
-	LIGHT_AO,
 	LIGHT_SPOT,
-	LIGHT_TRIANGLE,
-	LIGHT_STRAND
+	LIGHT_TRIANGLE
 } LightType;
 
 /* Camera Type */
@@ -445,6 +460,10 @@ typedef struct Intersection {
 	int prim;
 	int object;
 	int type;
+
+#ifdef __KERNEL_DEBUG__
+	int num_traversal_steps;
+#endif
 } Intersection;
 
 /* Primitives */
@@ -478,6 +497,7 @@ typedef enum AttributeElement {
 	ATTR_ELEMENT_VERTEX,
 	ATTR_ELEMENT_VERTEX_MOTION,
 	ATTR_ELEMENT_CORNER,
+	ATTR_ELEMENT_CORNER_BYTE,
 	ATTR_ELEMENT_CURVE,
 	ATTR_ELEMENT_CURVE_KEY,
 	ATTR_ELEMENT_CURVE_KEY_MOTION,
@@ -519,24 +539,32 @@ typedef enum AttributeStandard {
 #define MAX_CLOSURE 1
 #endif
 
+/* TODO(sergey): There is a rather nasty bug happening here, which
+ * could simply be a compiler bug for which we can't find a generic,
+ * platform-independent workaround. Also, even if it is a compiler
+ * issue, it's not so simple to upgrade the compiler in the release
+ * environment for Linux, and doing so this close to the release is
+ * rather risky business.
+ *
+ * For this release it's probably safer to stick with this rather
+ * dirty solution, and look for a cleaner fix during the next release
+ * cycle.
+ */
 typedef struct ShaderClosure {
 	ClosureType type;
 	float3 weight;
-
+#ifndef __APPLE__
 	float sample_weight;
-
+#endif
 	float data0;
 	float data1;
+	float data2;
 
 	float3 N;
-#if defined(__ANISOTROPIC__) || defined(__SUBSURFACE__) || defined(__HAIR__)
 	float3 T;
+#ifdef __APPLE__
+	float sample_weight;
 #endif
-
-#ifdef __HAIR__
-	float offset;
-#endif
-
 #ifdef __OSL__
 	void *prim;
 #endif
@@ -563,37 +591,49 @@ typedef enum ShaderContext {
 
 enum ShaderDataFlag {
 	/* runtime flags */
-	SD_BACKFACING = 1,		/* backside of surface? */
-	SD_EMISSION = 2,		/* have emissive closure? */
-	SD_BSDF = 4,			/* have bsdf closure? */
-	SD_BSDF_HAS_EVAL = 8,	/* have non-singular bsdf closure? */
-	SD_PHASE_HAS_EVAL = 8,	/* have non-singular phase closure? */
-	SD_BSDF_GLOSSY = 16,	/* have glossy bsdf */
-	SD_BSSRDF = 32,			/* have bssrdf */
-	SD_HOLDOUT = 64,		/* have holdout closure? */
-	SD_ABSORPTION = 128,	/* have volume absorption closure? */
-	SD_SCATTER = 256,		/* have volume phase closure? */
-	SD_AO = 512,			/* have ao closure? */
-	SD_TRANSPARENT = 1024,	/* have transparent closure? */
-
-	SD_CLOSURE_FLAGS = (SD_EMISSION|SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY|SD_BSSRDF|SD_HOLDOUT|SD_ABSORPTION|SD_SCATTER|SD_AO),
+	SD_BACKFACING     = (1 << 0),   /* backside of surface? */
+	SD_EMISSION       = (1 << 1),   /* have emissive closure? */
+	SD_BSDF           = (1 << 2),   /* have bsdf closure? */
+	SD_BSDF_HAS_EVAL  = (1 << 3),   /* have non-singular bsdf closure? */
+	SD_PHASE_HAS_EVAL = (1 << 3),   /* have non-singular phase closure? */
+	SD_BSDF_GLOSSY    = (1 << 4),   /* have glossy bsdf */
+	SD_BSSRDF         = (1 << 5),   /* have bssrdf */
+	SD_HOLDOUT        = (1 << 6),   /* have holdout closure? */
+	SD_ABSORPTION     = (1 << 7),   /* have volume absorption closure? */
+	SD_SCATTER        = (1 << 8),   /* have volume phase closure? */
+	SD_AO             = (1 << 9),   /* have ao closure? */
+	SD_TRANSPARENT    = (1 << 10),  /* have transparent closure? */
+
+	SD_CLOSURE_FLAGS = (SD_EMISSION|SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_GLOSSY|
+	                    SD_BSSRDF|SD_HOLDOUT|SD_ABSORPTION|SD_SCATTER|SD_AO),
 
 	/* shader flags */
-	SD_USE_MIS = 2048,					/* direct light sample */
-	SD_HAS_TRANSPARENT_SHADOW = 4096,	/* has transparent shadow */
-	SD_HAS_VOLUME = 8192,				/* has volume shader */
-	SD_HAS_ONLY_VOLUME = 16384,			/* has only volume shader, no surface */
-	SD_HETEROGENEOUS_VOLUME = 32768,	/* has heterogeneous volume */
-	SD_HAS_BSSRDF_BUMP = 65536,			/* bssrdf normal uses bump */
-
-	SD_SHADER_FLAGS = (SD_USE_MIS|SD_HAS_TRANSPARENT_SHADOW|SD_HAS_VOLUME|SD_HAS_ONLY_VOLUME|SD_HETEROGENEOUS_VOLUME|SD_HAS_BSSRDF_BUMP),
+	SD_USE_MIS                = (1 << 11),  /* direct light sample */
+	SD_HAS_TRANSPARENT_SHADOW = (1 << 12),  /* has transparent shadow */
+	SD_HAS_VOLUME             = (1 << 13),  /* has volume shader */
+	SD_HAS_ONLY_VOLUME        = (1 << 14),  /* has only volume shader, no surface */
+	SD_HETEROGENEOUS_VOLUME   = (1 << 15),  /* has heterogeneous volume */
+	SD_HAS_BSSRDF_BUMP        = (1 << 16),  /* bssrdf normal uses bump */
+	SD_VOLUME_EQUIANGULAR     = (1 << 17),  /* use equiangular sampling */
+	SD_VOLUME_MIS             = (1 << 18),  /* use multiple importance sampling */
+	SD_VOLUME_CUBIC           = (1 << 19),  /* use cubic interpolation for voxels */
+
+	SD_SHADER_FLAGS = (SD_USE_MIS|SD_HAS_TRANSPARENT_SHADOW|SD_HAS_VOLUME|
+	                   SD_HAS_ONLY_VOLUME|SD_HETEROGENEOUS_VOLUME|
+	                   SD_HAS_BSSRDF_BUMP|SD_VOLUME_EQUIANGULAR|SD_VOLUME_MIS|
+	                   SD_VOLUME_CUBIC),
 
 	/* object flags */
-	SD_HOLDOUT_MASK = 131072,			/* holdout for camera rays */
-	SD_OBJECT_MOTION = 262144,			/* has object motion blur */
-	SD_TRANSFORM_APPLIED = 524288, 		/* vertices have transform applied */
-
-	SD_OBJECT_FLAGS = (SD_HOLDOUT_MASK|SD_OBJECT_MOTION|SD_TRANSFORM_APPLIED)
+	SD_HOLDOUT_MASK             = (1 << 20),  /* holdout for camera rays */
+	SD_OBJECT_MOTION            = (1 << 21),  /* has object motion blur */
+	SD_TRANSFORM_APPLIED        = (1 << 22),  /* vertices have transform applied */
+	SD_NEGATIVE_SCALE_APPLIED   = (1 << 23),  /* vertices have negative scale applied */
+	SD_OBJECT_HAS_VOLUME        = (1 << 24),  /* object has a volume shader */
+	SD_OBJECT_INTERSECTS_VOLUME = (1 << 25),  /* object intersects AABB of an object with volume shader */
+
+	SD_OBJECT_FLAGS = (SD_HOLDOUT_MASK|SD_OBJECT_MOTION|SD_TRANSFORM_APPLIED|
+	                   SD_NEGATIVE_SCALE_APPLIED|SD_OBJECT_HAS_VOLUME|
+	                   SD_OBJECT_INTERSECTS_VOLUME)
 };
 
 struct KernelGlobals;
@@ -686,9 +726,10 @@ typedef struct PathState {
 	int flag;          
 
 	/* random number generator state */
-	int rng_offset;    /* dimension offset */
-	int sample;        /* path sample number */
-	int num_samples;   /* total number of times this path will be sampled */
+	int rng_offset;    		/* dimension offset */
+	int rng_offset_bsdf;  	/* dimension offset for picking bsdf */
+	int sample;        		/* path sample number */
+	int num_samples;		/* total number of times this path will be sampled */
 
 	/* bounce counting */
 	int bounce;
@@ -756,9 +797,12 @@ typedef struct KernelCamera {
 	/* render size */
 	float width, height;
 	int resolution;
-	int pad1;
+
+	/* anamorphic lens bokeh */
+	float inv_aperture_ratio;
+
+	int is_inside_volume;
 	int pad2;
-	int pad3;
 
 	/* more matrices */
 	Transform screentoworld;
@@ -819,6 +863,11 @@ typedef struct KernelFilm {
 	float mist_start;
 	float mist_inv_depth;
 	float mist_falloff;
+
+#ifdef __KERNEL_DEBUG__
+	int pass_bvh_traversal_steps;
+	int pass_pad3, pass_pad4, pass_pad5;
+#endif
 } KernelFilm;
 
 typedef struct KernelBackground {
@@ -860,7 +909,8 @@ typedef struct KernelIntegrator {
 	int transparent_shadows;
 
 	/* caustics */
-	int no_caustics;
+	int caustics_reflective;
+	int caustics_refractive;
 	float filter_glossy;
 
 	/* seed */
@@ -892,7 +942,6 @@ typedef struct KernelIntegrator {
 	int aa_samples;
 
 	/* volume render */
-	int volume_homogeneous_sampling;
 	int use_volumes;
 	int volume_max_steps;
 	float volume_step_size;
@@ -922,7 +971,6 @@ typedef enum CurveFlag {
 } CurveFlag;
 
 typedef struct KernelCurves {
-	/* strand intersect and normal parameters - many can be changed to flags */
 	int curveflags;
 	int subdivisions;
 
@@ -930,11 +978,11 @@ typedef struct KernelCurves {
 	float maximum_width;
 } KernelCurves;
 
-typedef struct KernelBlackbody {
-	int table_offset;
-	int pad1, pad2, pad3;
-} KernelBlackbody;
-
+typedef struct KernelTables {
+	int blackbody_offset;
+	int beckmann_offset;
+	int pad1, pad2;
+} KernelTables;
 
 typedef struct KernelData {
 	KernelCamera cam;
@@ -943,9 +991,17 @@ typedef struct KernelData {
 	KernelIntegrator integrator;
 	KernelBVH bvh;
 	KernelCurves curve;
-	KernelBlackbody blackbody;
+	KernelTables tables;
 } KernelData;
 
+#ifdef __KERNEL_DEBUG__
+typedef struct DebugData {
+	// Total number of BVH node traversal steps and primitive intersections
+	// for the camera rays.
+	int num_bvh_traversal_steps;
+} DebugData;
+#endif
+
 CCL_NAMESPACE_END
 
 #endif /*  __KERNEL_TYPES_H__ */
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index faaa68e3309..ce20f20e75a 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -116,6 +116,36 @@ ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, VolumeStack *st
 	return false;
 }
 
+ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stack)
+{
+	if(kernel_data.integrator.num_all_lights == 0)
+		return 0; /* scene has no lights: always report method 0 */
+	/* result is 0, SD_VOLUME_EQUIANGULAR or SD_VOLUME_MIS; 0 is presumably distance sampling -- TODO confirm */
+	int method = -1; /* -1: no stack entry inspected yet (also returned when the stack is empty) */
+	/* visit every stack entry up to the SHADER_NONE terminator */
+	for(int i = 0; stack[i].shader != SHADER_NONE; i++) {
+		int shader_flag = kernel_tex_fetch(__shader_flag, (stack[i].shader & SHADER_MASK)*2);
+
+		if(shader_flag & SD_VOLUME_MIS) {
+			return SD_VOLUME_MIS; /* one MIS shader settles the result */
+		}
+		else if(shader_flag & SD_VOLUME_EQUIANGULAR) {
+			if(method == 0) /* an earlier entry had neither flag: mixed stack falls back to MIS */
+				return SD_VOLUME_MIS;
+
+			method = SD_VOLUME_EQUIANGULAR;
+		}
+		else {
+			if(method == SD_VOLUME_EQUIANGULAR) /* an earlier entry was equiangular: mixed stack falls back to MIS */
+				return SD_VOLUME_MIS;
+
+			method = 0;
+		}
+	}
+
+	return method;
+}
+
 /* Volume Shadows
  *
  * These functions are used to attenuate shadow rays to lights. Both absorption
@@ -136,7 +166,7 @@ ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *s
 ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput)
 {
 	float3 tp = *throughput;
-	const float tp_eps = 1e-10f; /* todo: this is likely not the right value */
+	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
 
 	/* prepare for stepping */
 	int max_steps = kernel_data.integrator.volume_max_steps;
@@ -146,6 +176,8 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 	/* compute extinction at the start */
 	float t = 0.0f;
 
+	float3 sum = make_float3(0.0f, 0.0f, 0.0f);
+
 	for(int i = 0; i < max_steps; i++) {
 		/* advance to new position */
 		float new_t = min(ray->t, (i+1) * step);
@@ -160,20 +192,26 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 
 		/* compute attenuation over segment */
 		if(volume_shader_extinction_sample(kg, sd, state, new_P, &sigma_t)) {
-			/* todo: we could avoid computing expf() for each step by summing,
-			 * because exp(a)*exp(b) = exp(a+b), but we still want a quick
-			 * tp_eps check too */
-			tp *= volume_color_transmittance(sigma_t, new_t - t);
-
-			/* stop if nearly all light blocked */
-			if(tp.x < tp_eps && tp.y < tp_eps && tp.z < tp_eps)
-				break;
+			/* Compute expf() only for every Nth step, to save some calculations
+			 * because exp(a)*exp(b) = exp(a+b), also do a quick tp_eps check then. */
+
+			sum += (-sigma_t * (new_t - t));
+			if((i & 0x07) == 0) { /* ToDo: Other interval? */
+				tp = *throughput * make_float3(expf(sum.x), expf(sum.y), expf(sum.z));
+
+				/* stop if nearly all light is blocked */
+				if(tp.x < tp_eps && tp.y < tp_eps && tp.z < tp_eps)
+					break;
+			}
 		}
 
 		/* stop if at the end of the volume */
 		t = new_t;
-		if(t == ray->t)
+		if(t == ray->t) {
+			/* Update throughput in case we haven't done it above */
+			tp = *throughput * make_float3(expf(sum.x), expf(sum.y), expf(sum.z));
 			break;
+		}
 	}
 
 	*throughput = tp;
@@ -226,33 +264,6 @@ ccl_device float kernel_volume_equiangular_pdf(Ray *ray, float3 light_P, float s
 	return pdf;
 }
 
-ccl_device bool kernel_volume_equiangular_light_position(KernelGlobals *kg, PathState *state, Ray *ray, RNG *rng, float3 *light_P)
-{
-	/* light RNGs */
-	float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
-	float light_u, light_v;
-	path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-
-	/* light sample */
-	LightSample ls;
-	light_sample(kg, light_t, light_u, light_v, ray->time, ray->P, &ls);
-	if(ls.pdf == 0.0f)
-		return false;
-	
-	*light_P = ls.P;
-	return true;
-}
-
-ccl_device float kernel_volume_decoupled_equiangular_pdf(KernelGlobals *kg, PathState *state, Ray *ray, RNG *rng, float sample_t)
-{
-	float3 light_P;
-
-	if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P))
-		return 0.0f;
-
-	return kernel_volume_equiangular_pdf(ray, light_P, sample_t);
-}
-
 /* Distance sampling */
 
 ccl_device float kernel_volume_distance_sample(float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf)
@@ -312,7 +323,7 @@ ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coe
  * the volume shading coefficient for the entire line segment */
 ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGlobals *kg,
 	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput,
-	RNG *rng)
+	RNG *rng, bool probalistic_scatter)
 {
 	VolumeShaderCoefficients coeff;
 
@@ -323,6 +334,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 	float t = ray->t;
 	float3 new_tp;
 
+#ifdef __VOLUME_SCATTER__
 	/* randomly scatter, and if we do t is shortened */
 	if(closure_flag & SD_SCATTER) {
 		/* extinction coefficient */
@@ -330,43 +342,41 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 
 		/* pick random color channel, we use the Veach one-sample
 		 * model with balance heuristic for the channels */
-		float rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE);
+		float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
 		int channel = (int)(rphase*3.0f);
 		sd->randb_closure = rphase*3.0f - channel;
 
-		float xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE);
-
 		/* decide if we will hit or miss */
-		float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
-		float sample_transmittance = expf(-sample_sigma_t * t);
+		bool scatter = true;
+		float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+
+		if(probalistic_scatter) {
+			float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
+			float sample_transmittance = expf(-sample_sigma_t * t);
+
+			if(1.0f - xi >= sample_transmittance) {
+				scatter = true;
+
+				/* rescale random number so we can reuse it */
+				xi = 1.0f - (1.0f - xi - sample_transmittance)/(1.0f - sample_transmittance);
 
-		if(xi >= sample_transmittance) {
+			}
+			else
+				scatter = false;
+		}
+
+		if(scatter) {
 			/* scattering */
 			float3 pdf;
 			float3 transmittance;
 			float sample_t;
 
-			/* rescale random number so we can reuse it */
-			xi = (xi - sample_transmittance)/(1.0f - sample_transmittance);
-
-			if(kernel_data.integrator.volume_homogeneous_sampling == 0 || !kernel_data.integrator.num_all_lights) { 
-				/* distance sampling */
-				sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf);
-			}
-			else {
-				/* equiangular sampling */
-				float3 light_P;
-				float equi_pdf;
-				if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P))
-					return VOLUME_PATH_MISSED;
-
-				sample_t = kernel_volume_equiangular_sample(ray, light_P, xi, &equi_pdf);
-				transmittance = volume_color_transmittance(sigma_t, sample_t);
-				pdf = make_float3(equi_pdf, equi_pdf, equi_pdf);
-			}
+			/* distance sampling */
+			sample_t = kernel_volume_distance_sample(ray->t, sigma_t, channel, xi, &transmittance, &pdf);
 
 			/* modifiy pdf for hit/miss decision */
-			pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(sigma_t, t);
+			if(probalistic_scatter)
+				pdf *= make_float3(1.0f, 1.0f, 1.0f) - volume_color_transmittance(sigma_t, t);
 
 			new_tp = *throughput * coeff.sigma_s * transmittance / average(pdf);
 			t = sample_t;
@@ -378,14 +388,16 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 			new_tp = *throughput * transmittance / pdf;
 		}
 	}
-	else if(closure_flag & SD_ABSORPTION) {
+	else 
+#endif
+	if(closure_flag & SD_ABSORPTION) {
 		/* absorption only, no sampling needed */
 		float3 transmittance = volume_color_transmittance(coeff.sigma_a, t);
 		new_tp = *throughput * transmittance;
 	}
 
 	/* integrate emission attenuated by extinction */
-	if(closure_flag & SD_EMISSION) {
+	if(L && (closure_flag & SD_EMISSION)) {
 		float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
 		float3 transmittance = volume_color_transmittance(sigma_t, ray->t);
 		float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, ray->t);
@@ -408,13 +420,15 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 	return VOLUME_PATH_ATTENUATED;
 }
 
-/* heterogeneous volume: integrate stepping through the volume until we
- * reach the end, get absorbed entirely, or run out of iterations */
-ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlobals *kg,
+/* heterogeneous volume distance sampling: integrate stepping through the
+ * volume until we reach the end, get absorbed entirely, or run out of
+ * iterations. this probabilistically scatters or gets transmitted through,
+ * for path tracing where we don't want to branch. */
+ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
 	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, RNG *rng)
 {
 	float3 tp = *throughput;
-	const float tp_eps = 1e-10f; /* todo: this is likely not the right value */
+	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
 
 	/* prepare for stepping */
 	int max_steps = kernel_data.integrator.volume_max_steps;
@@ -425,9 +439,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 	float t = 0.0f;
 	float3 accum_transmittance = make_float3(1.0f, 1.0f, 1.0f);
 
-	/* cache some constant variables */
-	float xi;
-	int channel = -1;
+	/* pick random color channel, we use the Veach one-sample
+	 * model with balance heuristic for the channels */
+	float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+	float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
+	int channel = (int)(rphase*3.0f);
+	sd->randb_closure = rphase*3.0f - channel;
 	bool has_scatter = false;
 
 	for(int i = 0; i < max_steps; i++) {
@@ -449,25 +466,14 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 			float3 transmittance;
 			bool scatter = false;
 
-			/* randomly scatter, and if we do dt and new_t are shortened */
+			/* distance sampling */
+#ifdef __VOLUME_SCATTER__
 			if((closure_flag & SD_SCATTER) || (has_scatter && (closure_flag & SD_ABSORPTION))) {
 				has_scatter = true;
 
-				/* average sigma_t and sigma_s over segment */
 				float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
 				float3 sigma_s = coeff.sigma_s;
 
-				/* lazily set up variables for sampling */
-				if(channel == -1) {
-					/* pick random color channel, we use the Veach one-sample
-					 * model with balance heuristic for the channels */
-					xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE);
-
-					float rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE);
-					channel = (int)(rphase*3.0f);
-					sd->randb_closure = rphase*3.0f - channel;
-				}
-
 				/* compute transmittance over full step */
 				transmittance = volume_color_transmittance(sigma_t, dt);
 
@@ -480,10 +486,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 					float new_dt = -logf(1.0f - xi)/sample_sigma_t;
 					new_t = t + new_dt;
 
-					/* transmittance, throughput */
+					/* transmittance and pdf */
 					float3 new_transmittance = volume_color_transmittance(sigma_t, new_dt);
-					float pdf = average(sigma_t * new_transmittance);
-					new_tp = tp * sigma_s * new_transmittance / pdf;
+					float3 pdf = sigma_t * new_transmittance;
+
+					/* throughput */
+					new_tp = tp * sigma_s * new_transmittance / average(pdf);
 					scatter = true;
 				}
 				else {
@@ -495,7 +503,9 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 					xi = 1.0f - (1.0f - xi)/sample_transmittance;
 				}
 			}
-			else if(closure_flag & SD_ABSORPTION) {
+			else 
+#endif
+			if(closure_flag & SD_ABSORPTION) {
 				/* absorption only, no sampling needed */
 				float3 sigma_a = coeff.sigma_a;
 
@@ -504,7 +514,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 			}
 
 			/* integrate emission attenuated by absorption */
-			if(closure_flag & SD_EMISSION) {
+			if(L && (closure_flag & SD_EMISSION)) {
 				float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, dt);
 				path_radiance_accum_emission(L, tp, emission, state->bounce);
 			}
@@ -518,19 +528,19 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 					tp = make_float3(0.0f, 0.0f, 0.0f);
 					break;
 				}
+			}
 
-				/* prepare to scatter to new direction */
-				if(scatter) {
-					/* adjust throughput and move to new location */
-					sd->P = ray->P + new_t*ray->D;
-					*throughput = tp;
+			/* prepare to scatter to new direction */
+			if(scatter) {
+				/* adjust throughput and move to new location */
+				sd->P = ray->P + new_t*ray->D;
+				*throughput = tp;
 
-					return VOLUME_PATH_SCATTERED;
-				}
-				else {
-					/* accumulate transmittance */
-					accum_transmittance *= transmittance;
-				}
+				return VOLUME_PATH_SCATTERED;
+			}
+			else {
+				/* accumulate transmittance */
+				accum_transmittance *= transmittance;
 			}
 		}
 
@@ -545,14 +555,34 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous(KernelGlo
 	return VOLUME_PATH_ATTENUATED;
 }
 
+/* get the volume attenuation and emission over line segment defined by
+ * ray, with the assumption that there are no surfaces blocking light
+ * between the endpoints. distance sampling is used to decide if we will
+ * scatter or not. */
+ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg,
+	PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng, bool heterogeneous)
+{
+	/* workaround to fix correlation bug in T38710, can find better solution
+	 * in random number generator later, for now this is done here to not impact
+	 * performance of rendering without volumes */
+	RNG tmp_rng = cmj_hash(*rng, state->rng_offset);
+	/* from here on the hashed tmp_rng, not the caller's rng, is handed to the integrators */
+	shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce);
+	/* dispatch on the caller-supplied flag; note both callees take ray before sd, unlike this function */
+	if(heterogeneous)
+		return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput, &tmp_rng);
+	else
+		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, &tmp_rng, true); /* true = probalistic_scatter */
+}
+
 /* Decoupled Volume Sampling
  *
  * VolumeSegment is list of coefficients and transmittance stored at all steps
  * through a volume. This can then latter be used for decoupled sampling as in:
- * "Importance Sampling Techniques for Path Tracing in Participating Media" */
-
-/* CPU only because of malloc/free */
-#ifdef __KERNEL_CPU__
+ * "Importance Sampling Techniques for Path Tracing in Participating Media"
+ *
+ * On the GPU this is only supported for homogeneous volumes (1 step), due to
+ * no support for malloc/free and too much stack usage with a fixed size array. */
 
 typedef struct VolumeStep {
 	float3 sigma_s;				/* scatter coefficient */
@@ -571,6 +601,8 @@ typedef struct VolumeSegment {
 
 	float3 accum_emission;		/* accumulated emission at end of segment */
 	float3 accum_transmittance;	/* accumulated transmittance at end of segment */
+
+	int sampling_method;		/* volume sampling method */
 } VolumeSegment;
 
 /* record volume steps to the end of the volume.
@@ -578,10 +610,12 @@ typedef struct VolumeSegment {
  * it would be nice if we could only record up to the point that we need to scatter,
  * but the entire segment is needed to do always scattering, rather than probalistically
  * hitting or missing the volume. if we don't know the transmittance at the end of the
- * volume we can't generate stratitied distance samples up to that transmittance */
+ * volume we can't generate stratified distance samples up to that transmittance */
 ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state,
 	Ray *ray, ShaderData *sd, VolumeSegment *segment, bool heterogeneous)
 {
+	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
+
 	/* prepare for volume stepping */
 	int max_steps;
 	float step_size, random_jitter_offset;
@@ -608,6 +642,7 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 
 	segment->closure_flag = 0;
 	segment->numsteps = 0;
+
 	segment->steps = (VolumeStep*)malloc(sizeof(VolumeStep)*max_steps);
 
 	VolumeStep *step = segment->steps;
@@ -669,6 +704,10 @@ ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *sta
 		t = new_t;
 		if(t == ray->t)
 			break;
+
+		/* stop if nearly all light blocked */
+		if(accum_transmittance.x < tp_eps && accum_transmittance.y < tp_eps && accum_transmittance.z < tp_eps)
+			break;
 	}
 
 	/* store total emission and transmittance */
@@ -698,35 +737,70 @@ ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *s
  * scattering, they always scatter if there is any non-zero scattering
  * coefficient.
  *
- * these also do not do emission or modify throughput. */
+ * these also do not do emission or modify throughput.
+ *
+ * the function is expected to return VOLUME_PATH_SCATTERED when probalistic_scatter is false */
 ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 	KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd,
-	float3 *throughput, RNG *rng, VolumeSegment *segment)
+	float3 *throughput, float rphase, float rscatter,
+	const VolumeSegment *segment, const float3 *light_P, bool probalistic_scatter)
 {
-	int closure_flag = segment->closure_flag;
-
-	if(!(closure_flag & SD_SCATTER))
-		return VOLUME_PATH_MISSED;
+	kernel_assert(segment->closure_flag & SD_SCATTER);
 
 	/* pick random color channel, we use the Veach one-sample
 	 * model with balance heuristic for the channels */
-	float rphase = path_state_rng_1D(kg, rng, state, PRNG_PHASE);
 	int channel = (int)(rphase*3.0f);
 	sd->randb_closure = rphase*3.0f - channel;
+	float xi = rscatter;
 
-	float xi = path_state_rng_1D(kg, rng, state, PRNG_SCATTER_DISTANCE);
+	/* probabilistic scattering decision based on transmittance */
+	if(probalistic_scatter) {
+		float sample_transmittance = kernel_volume_channel_get(segment->accum_transmittance, channel);
+
+		if(1.0f - xi >= sample_transmittance) {
+			/* rescale random number so we can reuse it */
+			xi = 1.0f - (1.0f - xi - sample_transmittance)/(1.0f - sample_transmittance);
+		}
+		else {
+			*throughput /= sample_transmittance;
+			return VOLUME_PATH_MISSED;
+		}
+	}
 
 	VolumeStep *step;
 	float3 transmittance;
 	float pdf, sample_t;
+	float mis_weight = 1.0f;
+	bool distance_sample = true;
+	bool use_mis = false;
+
+	if(segment->sampling_method && light_P) {
+		if(segment->sampling_method == SD_VOLUME_MIS) {
+			/* multiple importance sample: randomly pick between
+			 * equiangular and distance sampling strategy */
+			if(xi < 0.5f) {
+				xi *= 2.0f;
+			}
+			else {
+				xi = (xi - 0.5f)*2.0f;
+				distance_sample = false;
+			}
+
+			use_mis = true;
+		}
+		else {
+			/* only equiangular sampling */
+			distance_sample = false;
+		}
+	}
 
 	/* distance sampling */
-	if(kernel_data.integrator.volume_homogeneous_sampling == 0 || !kernel_data.integrator.num_all_lights) { 
+	if(distance_sample) {
 		/* find step in cdf */
 		step = segment->steps;
 
 		float prev_t = 0.0f;
-		float3 step_pdf = make_float3(1.0f, 1.0f, 1.0f);
+		float3 step_pdf_distance = make_float3(1.0f, 1.0f, 1.0f);
 
 		if(segment->numsteps > 1) {
 			float prev_cdf = 0.0f;
@@ -749,7 +823,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 			xi = (xi - prev_cdf)/(step_cdf - prev_cdf);
 
 			/* pdf for picking step */
-			step_pdf = step->cdf_distance - prev_cdf_distance;
+			step_pdf_distance = step->cdf_distance - prev_cdf_distance;
 		}
 
 		/* determine range in which we will sample */
@@ -758,35 +832,77 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 		/* sample distance and compute transmittance */
 		float3 distance_pdf;
 		sample_t = prev_t + kernel_volume_distance_sample(step_t, step->sigma_t, channel, xi, &transmittance, &distance_pdf);
-		pdf = average(distance_pdf * step_pdf);
+
+		/* modify pdf for hit/miss decision */
+		if(probalistic_scatter)
+			distance_pdf *= make_float3(1.0f, 1.0f, 1.0f) - segment->accum_transmittance;
+
+		pdf = average(distance_pdf * step_pdf_distance);
+
+		/* multiple importance sampling */
+		if(use_mis) {
+			float equi_pdf = kernel_volume_equiangular_pdf(ray, *light_P, sample_t);
+			mis_weight = 2.0f*power_heuristic(pdf, equi_pdf);
+		}
 	}
 	/* equi-angular sampling */
 	else {
-		/* pick position on light */
-		float3 light_P;
-		if(!kernel_volume_equiangular_light_position(kg, state, ray, rng, &light_P))
-			return VOLUME_PATH_MISSED;
-
 		/* sample distance */
-		sample_t = kernel_volume_equiangular_sample(ray, light_P, xi, &pdf);
+		sample_t = kernel_volume_equiangular_sample(ray, *light_P, xi, &pdf);
 
 		/* find step in which sampled distance is located */
 		step = segment->steps;
 
 		float prev_t = 0.0f;
+		float3 step_pdf_distance = make_float3(1.0f, 1.0f, 1.0f);
 
 		if(segment->numsteps > 1) {
-			/* todo: optimize using binary search */
-			for(int i = 0; i < segment->numsteps-1; i++, step++) {
-				if(sample_t < step->t)
+			float3 prev_cdf_distance = make_float3(0.0f, 0.0f, 0.0f);
+
+			int numsteps = segment->numsteps;
+			int high = numsteps - 1;
+			int low = 0;
+			int mid;
+
+			while(low < high) {
+				mid = (low + high) >> 1;
+
+				if(sample_t < step[mid].t)
+					high = mid;
+				else if(sample_t >= step[mid + 1].t)
+					low = mid + 1;
+				else {
+					/* found our interval in step[mid] .. step[mid+1] */
+					prev_t = step[mid].t;
+					prev_cdf_distance = step[mid].cdf_distance;
+					step += mid+1;
 					break;
+				}
+			}
 
-				prev_t = step->t;
+			if(low >= numsteps - 1) {
+				prev_t = step[numsteps - 1].t;
+				prev_cdf_distance = step[numsteps-1].cdf_distance;
+				step += numsteps - 1;
 			}
+
+			/* pdf for picking step with distance sampling */
+			step_pdf_distance = step->cdf_distance - prev_cdf_distance;
 		}
-		
+
+		/* determine range in which we will sample */
+		float step_t = step->t - prev_t;
+		float step_sample_t = sample_t - prev_t;
+
 		/* compute transmittance */
-		transmittance = volume_color_transmittance(step->sigma_t, sample_t - prev_t);
+		transmittance = volume_color_transmittance(step->sigma_t, step_sample_t);
+
+		/* multiple importance sampling */
+		if(use_mis) {
+			float3 distance_pdf3 = kernel_volume_distance_pdf(step_t, step->sigma_t, step_sample_t);
+			float distance_pdf = average(distance_pdf3 * step_pdf_distance);
+			mis_weight = 2.0f*power_heuristic(pdf, distance_pdf);
+		}
 	}
 
 	/* compute transmittance up to this step */
@@ -794,7 +910,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 		transmittance *= (step-1)->accum_transmittance;
 
 	/* modify throughput */
-	*throughput *= step->sigma_s * transmittance / pdf;
+	*throughput *= step->sigma_s * transmittance * (mis_weight / pdf);
 
 	/* evaluate shader to create closures at shading point */
 	if(segment->numsteps > 1) {
@@ -810,40 +926,27 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 	return VOLUME_PATH_SCATTERED;
 }
 
-#endif
-
-/* get the volume attenuation and emission over line segment defined by
- * ray, with the assumption that there are no surfaces blocking light
- * between the endpoints */
-ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg,
-	PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng)
+/* decide if decoupled ray marching is used; direct picks the direct or indirect sample-all-lights setting */
+ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneous, bool direct, int sampling_method)
 {
-	/* workaround to fix correlation bug in T38710, can find better solution
-	 * in random number generator later, for now this is done here to not impact
-	 * performance of rendering without volumes */
-	RNG tmp_rng = cmj_hash(*rng, state->rng_offset);
-	bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
-
-#if 0
-	/* debugging code to compare decoupled ray marching */
-	VolumeSegment segment;
-
-	shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce);
-	kernel_volume_decoupled_record(kg, state, ray, sd, &segment, heterogeneous);
-
-	VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, throughput, &tmp_rng, &segment);
-
-	kernel_volume_decoupled_free(kg, &segment);
+	/* decoupled ray marching for heterogeneous volumes is not supported on the
+	 * GPU, which also means equiangular and multiple importance sampling are
+	 * not supported for that case */
+#ifdef __KERNEL_GPU__
+	if(heterogeneous)
+		return false;
+#endif
 
-	return result;
-#else
-	shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce);
+	/* equiangular and multiple importance sampling only implemented for decoupled */
+	if(sampling_method != 0)
+		return true;
 
-	if(heterogeneous)
-		return kernel_volume_integrate_heterogeneous(kg, state, ray, sd, L, throughput, &tmp_rng);
+	/* for all light sampling use decoupled, reusing shader evaluations is
+	 * typically faster in that case */
+	if(direct)
+		return kernel_data.integrator.sample_all_lights_direct;
 	else
-		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, &tmp_rng);
-#endif
+		return kernel_data.integrator.sample_all_lights_indirect;
 }
 
 /* Volume Stack
@@ -851,17 +954,88 @@ ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals
  * This is an array of object/shared ID's that the current segment of the path
  * is inside of. */
 
-ccl_device void kernel_volume_stack_init(KernelGlobals *kg, VolumeStack *stack)
+ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
+                                         Ray *ray,
+                                         VolumeStack *stack)
 {
-	/* todo: this assumes camera is always in air, need to detect when it isn't */
-	if(kernel_data.background.volume_shader == SHADER_NONE) {
-		stack[0].shader = SHADER_NONE;
+	/* Fill the volume stack for a camera ray. A NULL ray happens in the
+	 * baker; todo: does it need proper initialization of camera in volume?
+	 */
+	if(!kernel_data.cam.is_inside_volume || ray == NULL) {
+		/* Camera is guaranteed to be in the air, only take background volume
+		 * into account in this case.
+		 */
+		if(kernel_data.background.volume_shader != SHADER_NONE) {
+			stack[0].shader = kernel_data.background.volume_shader;
+			stack[0].object = PRIM_NONE;
+			stack[1].shader = SHADER_NONE;
+		}
+		else {
+			stack[0].shader = SHADER_NONE;
+		}
+		return;
 	}
-	else {
+
+	Ray volume_ray = *ray;
+	volume_ray.t = FLT_MAX;
+
+	int stack_index = 0, enclosed_index = 0;
+	int enclosed_volumes[VOLUME_STACK_SIZE];
+
+	while(stack_index < VOLUME_STACK_SIZE - 1 &&
+	      enclosed_index < VOLUME_STACK_SIZE - 1)
+	{
+		Intersection isect;
+		if(!scene_intersect_volume(kg, &volume_ray, &isect)) {
+			break;
+		}
+
+		ShaderData sd;
+		shader_setup_from_ray(kg, &sd, &isect, &volume_ray, 0, 0);
+		if(sd.flag & SD_HAS_VOLUME) {
+			if(sd.flag & SD_BACKFACING) {
+				/* If the ray exits a volume it never entered, the camera
+				 * must be inside that volume.
+				 */
+				bool is_enclosed = false;
+				for(int i = 0; i < enclosed_index; ++i) {
+					if(enclosed_volumes[i] == sd.object) {
+						is_enclosed = true;
+						break;
+					}
+				}
+				if(is_enclosed == false) {
+					stack[stack_index].object = sd.object;
+					stack[stack_index].shader = sd.shader;
+					++stack_index;
+				}
+			}
+			else {
+				/* If ray from camera enters the volume, this volume shouldn't
+				 * be added to the stack on exit.
+				 */
+				enclosed_volumes[enclosed_index++] = sd.object;
+			}
+		}
+
+		/* Move ray forward. */
+		volume_ray.P = ray_offset(sd.P, -sd.Ng);
+	}
+	/* stack_index of 0 means the quick checks outside of the kernel gave a
+	 * false positive: no enclosing volume was found, so the camera is in
+	 * the air after all and the intersection tests above were wasted.
+	 *
+	 * In this case do the same as in the early-out above: push the
+	 * background volume if there is one, otherwise terminate the stack.
+	 */
+	if(stack_index == 0 && kernel_data.background.volume_shader != SHADER_NONE) {
 		stack[0].shader = kernel_data.background.volume_shader;
 		stack[0].object = PRIM_NONE;
 		stack[1].shader = SHADER_NONE;
 	}
+	else {
+		stack[stack_index].shader = SHADER_NONE;
+	}
 }
 
 ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd, VolumeStack *stack)
@@ -910,4 +1084,3 @@ ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/osl/SConscript b/intern/cycles/kernel/osl/SConscript
index 4685bb7753e..d721edbaf6e 100644
--- a/intern/cycles/kernel/osl/SConscript
+++ b/intern/cycles/kernel/osl/SConscript
@@ -43,6 +43,9 @@ defs.append('CCL_NAMESPACE_BEGIN=namespace ccl {')
 defs.append('CCL_NAMESPACE_END=}')
 defs.append('WITH_OSL')
 
+if env['WITH_BF_CYCLES_DEBUG']:
+    defs.append('WITH_CYCLES_DEBUG')
+
 if env['OURPLATFORM'] in ('win32-vc', 'win64-vc'):
     cxxflags.append('-DBOOST_NO_RTTI -DBOOST_NO_TYPEID /fp:fast'.split())
     incs.append(env['BF_PTHREADS_INC'])
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index 94337290d20..84ef85e089d 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -66,18 +66,6 @@ ClosureParam *closure_bssrdf_cubic_params()
 	static ClosureParam params[] = {
 		CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, sc.N),
 		CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, radius),
-		//CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.data1),
-	    CLOSURE_STRING_KEYPARAM("label"),
-	    CLOSURE_FINISH_PARAM(CubicBSSRDFClosure)
-	};
-	return params;
-}
-
-ClosureParam *closure_bssrdf_cubic_extended_params()
-{
-	static ClosureParam params[] = {
-		CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, sc.N),
-		CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, radius),
 		CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.data1),
 		CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.T.x),
 	    CLOSURE_STRING_KEYPARAM("label"),
@@ -107,18 +95,6 @@ ClosureParam *closure_bssrdf_gaussian_params()
 	static ClosureParam params[] = {
 		CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, sc.N),
 		CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, radius),
-		//CLOSURE_FLOAT_PARAM(GaussianBSSRDFClosure, sc.data1),
-	    CLOSURE_STRING_KEYPARAM("label"),
-	    CLOSURE_FINISH_PARAM(GaussianBSSRDFClosure)
-	};
-	return params;
-}
-
-ClosureParam *closure_bssrdf_gaussian_extended_params()
-{
-	static ClosureParam params[] = {
-		CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, sc.N),
-		CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, radius),
 		CLOSURE_FLOAT_PARAM(GaussianBSSRDFClosure, sc.data1),
 	    CLOSURE_STRING_KEYPARAM("label"),
 	    CLOSURE_FINISH_PARAM(GaussianBSSRDFClosure)
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index a96c0e2b1fb..cc9942b024e 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -41,6 +41,8 @@
 #include "util_param.h"
 
 #include "kernel_types.h"
+#include "kernel_compat_cpu.h"
+#include "kernel_globals.h"
 #include "kernel_montecarlo.h"
 
 #include "closure/bsdf_util.h"
@@ -51,8 +53,7 @@
 #include "closure/bsdf_reflection.h"
 #include "closure/bsdf_refraction.h"
 #include "closure/bsdf_transparent.h"
-#include "closure/bsdf_ward.h"
-#include "closure/bsdf_westin.h"
+#include "closure/bsdf_ashikhmin_shirley.h"
 #include "closure/bsdf_toon.h"
 #include "closure/bsdf_hair.h"
 #include "closure/volume.h"
@@ -85,16 +86,6 @@ BSDF_CLOSURE_CLASS_BEGIN(Refraction, refraction, refraction, LABEL_SINGULAR)
 	CLOSURE_FLOAT_PARAM(RefractionClosure, sc.data0),
 BSDF_CLOSURE_CLASS_END(Refraction, refraction)
 
-BSDF_CLOSURE_CLASS_BEGIN(WestinBackscatter, westin_backscatter, westin_backscatter, LABEL_GLOSSY)
-	CLOSURE_FLOAT3_PARAM(WestinBackscatterClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(WestinBackscatterClosure, sc.data0),
-BSDF_CLOSURE_CLASS_END(WestinBackscatter, westin_backscatter)
-
-BSDF_CLOSURE_CLASS_BEGIN(WestinSheen, westin_sheen, westin_sheen, LABEL_DIFFUSE)
-	CLOSURE_FLOAT3_PARAM(WestinSheenClosure, sc.N),
-	CLOSURE_FLOAT_PARAM(WestinSheenClosure, sc.data0),
-BSDF_CLOSURE_CLASS_END(WestinSheen, westin_sheen)
-
 BSDF_CLOSURE_CLASS_BEGIN(Transparent, transparent, transparent, LABEL_SINGULAR)
 BSDF_CLOSURE_CLASS_END(Transparent, transparent)
 
@@ -103,12 +94,12 @@ BSDF_CLOSURE_CLASS_BEGIN(AshikhminVelvet, ashikhmin_velvet, ashikhmin_velvet, LA
 	CLOSURE_FLOAT_PARAM(AshikhminVelvetClosure, sc.data0),
 BSDF_CLOSURE_CLASS_END(AshikhminVelvet, ashikhmin_velvet)
 
-BSDF_CLOSURE_CLASS_BEGIN(Ward, ward, ward, LABEL_GLOSSY)
-	CLOSURE_FLOAT3_PARAM(WardClosure, sc.N),
-	CLOSURE_FLOAT3_PARAM(WardClosure, sc.T),
-	CLOSURE_FLOAT_PARAM(WardClosure, sc.data0),
-	CLOSURE_FLOAT_PARAM(WardClosure, sc.data1),
-BSDF_CLOSURE_CLASS_END(Ward, ward)
+BSDF_CLOSURE_CLASS_BEGIN(AshikhminShirley, ashikhmin_shirley_aniso, ashikhmin_shirley, LABEL_GLOSSY|LABEL_REFLECT)
+	CLOSURE_FLOAT3_PARAM(AshikhminShirleyClosure, sc.N),
+	CLOSURE_FLOAT3_PARAM(AshikhminShirleyClosure, sc.T),
+	CLOSURE_FLOAT_PARAM(AshikhminShirleyClosure, sc.data0),
+	CLOSURE_FLOAT_PARAM(AshikhminShirleyClosure, sc.data1),
+BSDF_CLOSURE_CLASS_END(AshikhminShirley, ashikhmin_shirley_aniso)
 
 BSDF_CLOSURE_CLASS_BEGIN(DiffuseToon, diffuse_toon, diffuse_toon, LABEL_DIFFUSE)
 	CLOSURE_FLOAT3_PARAM(DiffuseToonClosure, sc.N),
@@ -122,26 +113,40 @@ BSDF_CLOSURE_CLASS_BEGIN(GlossyToon, glossy_toon, glossy_toon, LABEL_GLOSSY)
 	CLOSURE_FLOAT_PARAM(GlossyToonClosure, sc.data1),
 BSDF_CLOSURE_CLASS_END(GlossyToon, glossy_toon)
 
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGX, microfacet_ggx, microfacet_ggx, LABEL_GLOSSY)
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGX, microfacet_ggx, microfacet_ggx, LABEL_GLOSSY|LABEL_REFLECT)
 	CLOSURE_FLOAT3_PARAM(MicrofacetGGXClosure, sc.N),
 	CLOSURE_FLOAT_PARAM(MicrofacetGGXClosure, sc.data0),
 BSDF_CLOSURE_CLASS_END(MicrofacetGGX, microfacet_ggx)
 
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmann, microfacet_beckmann, microfacet_beckmann, LABEL_GLOSSY)
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXAniso, microfacet_ggx_aniso, microfacet_ggx, LABEL_GLOSSY|LABEL_REFLECT)
+	CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoClosure, sc.N),
+	CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoClosure, sc.T),
+	CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoClosure, sc.data0),
+	CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoClosure, sc.data1),
+BSDF_CLOSURE_CLASS_END(MicrofacetGGXAniso, microfacet_ggx_aniso)
+
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmann, microfacet_beckmann, microfacet_beckmann, LABEL_GLOSSY|LABEL_REFLECT)
 	CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannClosure, sc.N),
 	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannClosure, sc.data0),
 BSDF_CLOSURE_CLASS_END(MicrofacetBeckmann, microfacet_beckmann)
 
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXRefraction, microfacet_ggx_refraction, microfacet_ggx, LABEL_GLOSSY)
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannAniso, microfacet_beckmann_aniso, microfacet_beckmann, LABEL_GLOSSY|LABEL_REFLECT)
+	CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannAnisoClosure, sc.N),
+	CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannAnisoClosure, sc.T),
+	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannAnisoClosure, sc.data0),
+	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannAnisoClosure, sc.data1),
+BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannAniso, microfacet_beckmann_aniso)
+
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXRefraction, microfacet_ggx_refraction, microfacet_ggx, LABEL_GLOSSY|LABEL_TRANSMIT)
 	CLOSURE_FLOAT3_PARAM(MicrofacetGGXRefractionClosure, sc.N),
 	CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, sc.data0),
-	CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, sc.data1),
+	CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, sc.data2),
 BSDF_CLOSURE_CLASS_END(MicrofacetGGXRefraction, microfacet_ggx_refraction)
 
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction, microfacet_beckmann, LABEL_GLOSSY)
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction, microfacet_beckmann, LABEL_GLOSSY|LABEL_TRANSMIT)
 	CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannRefractionClosure, sc.N),
 	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, sc.data0),
-	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, sc.data1),
+	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, sc.data2),
 BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction)
 
 BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, hair_reflection, LABEL_GLOSSY)
@@ -150,7 +155,7 @@ BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, hair_reflection, LABEL
 	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data1),
 #ifdef __HAIR__
 	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.T),
-	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.offset),
+	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data2),
 #else
 	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.N),
 	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data1),
@@ -163,7 +168,7 @@ BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, hair_transmission,
 	CLOSURE_FLOAT_PARAM(HairTransmissionClosure, sc.data1),
 #ifdef __HAIR__
 	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.T),
-	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.offset),
+	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data2),
 #else
 	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, sc.N),
 	CLOSURE_FLOAT_PARAM(HairReflectionClosure, sc.data1),
@@ -210,26 +215,24 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
 		bsdf_transparent_params(), bsdf_transparent_prepare);
 	register_closure(ss, "microfacet_ggx", id++,
 		bsdf_microfacet_ggx_params(), bsdf_microfacet_ggx_prepare);
+	register_closure(ss, "microfacet_ggx_aniso", id++,
+		bsdf_microfacet_ggx_aniso_params(), bsdf_microfacet_ggx_aniso_prepare);
 	register_closure(ss, "microfacet_ggx_refraction", id++,
 		bsdf_microfacet_ggx_refraction_params(), bsdf_microfacet_ggx_refraction_prepare);
 	register_closure(ss, "microfacet_beckmann", id++,
 		bsdf_microfacet_beckmann_params(), bsdf_microfacet_beckmann_prepare);
+	register_closure(ss, "microfacet_beckmann_aniso", id++,
+		bsdf_microfacet_beckmann_aniso_params(), bsdf_microfacet_beckmann_aniso_prepare);
 	register_closure(ss, "microfacet_beckmann_refraction", id++,
 		bsdf_microfacet_beckmann_refraction_params(), bsdf_microfacet_beckmann_refraction_prepare);
-	register_closure(ss, "ward", id++,
-		bsdf_ward_params(), bsdf_ward_prepare);
+	register_closure(ss, "ashikhmin_shirley", id++,
+		bsdf_ashikhmin_shirley_aniso_params(), bsdf_ashikhmin_shirley_aniso_prepare);
 	register_closure(ss, "ashikhmin_velvet", id++,
 		bsdf_ashikhmin_velvet_params(), bsdf_ashikhmin_velvet_prepare);
 	register_closure(ss, "diffuse_toon", id++,
 		bsdf_diffuse_toon_params(), bsdf_diffuse_toon_prepare);
 	register_closure(ss, "glossy_toon", id++,
 		bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare);
-	register_closure(ss, "specular_toon", id++,
-		bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare);
-	register_closure(ss, "westin_backscatter", id++,
-		bsdf_westin_backscatter_params(), bsdf_westin_backscatter_prepare);
-	register_closure(ss, "westin_sheen", id++,
-		bsdf_westin_sheen_params(), bsdf_westin_sheen_prepare);
 
 	register_closure(ss, "emission", id++,
 		closure_emission_params(), closure_emission_prepare);
@@ -247,10 +250,6 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
 		closure_bssrdf_cubic_params(), closure_bssrdf_cubic_prepare);
 	register_closure(ss, "bssrdf_gaussian", id++,
 		closure_bssrdf_gaussian_params(), closure_bssrdf_gaussian_prepare);
-	register_closure(ss, "bssrdf_cubic", id++,
-		closure_bssrdf_cubic_extended_params(), closure_bssrdf_cubic_prepare);
-	register_closure(ss, "bssrdf_gaussian", id++,
-		closure_bssrdf_gaussian_extended_params(), closure_bssrdf_gaussian_prepare);
 
 	register_closure(ss, "hair_reflection", id++,
 		bsdf_hair_reflection_params(), bsdf_hair_reflection_prepare);
diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h
index 218cf1c19cc..5e833d738d8 100644
--- a/intern/cycles/kernel/osl/osl_closures.h
+++ b/intern/cycles/kernel/osl/osl_closures.h
@@ -48,12 +48,8 @@ OSL::ClosureParam *closure_holdout_params();
 OSL::ClosureParam *closure_ambient_occlusion_params();
 OSL::ClosureParam *closure_bsdf_diffuse_ramp_params();
 OSL::ClosureParam *closure_bsdf_phong_ramp_params();
-OSL::ClosureParam *closure_westin_backscatter_params();
-OSL::ClosureParam *closure_westin_sheen_params();
 OSL::ClosureParam *closure_bssrdf_cubic_params();
 OSL::ClosureParam *closure_bssrdf_gaussian_params();
-OSL::ClosureParam *closure_bssrdf_cubic_extended_params();
-OSL::ClosureParam *closure_bssrdf_gaussian_extended_params();
 OSL::ClosureParam *closure_henyey_greenstein_volume_params();
 
 void closure_emission_prepare(OSL::RendererServices *, int id, void *data);
@@ -62,8 +58,6 @@ void closure_holdout_prepare(OSL::RendererServices *, int id, void *data);
 void closure_ambient_occlusion_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bsdf_diffuse_ramp_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bsdf_phong_ramp_prepare(OSL::RendererServices *, int id, void *data);
-void closure_westin_backscatter_prepare(OSL::RendererServices *, int id, void *data);
-void closure_westin_sheen_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bssrdf_cubic_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bssrdf_gaussian_prepare(OSL::RendererServices *, int id, void *data);
 void closure_henyey_greenstein_volume_prepare(OSL::RendererServices *, int id, void *data);
@@ -149,17 +143,18 @@ public: \
 \
 	void blur(float roughness) \
 	{ \
-		bsdf_##svmlower##_blur(&sc, roughness); \
 	} \
 \
 	float3 eval_reflect(const float3 &omega_out, const float3 &omega_in, float& pdf) const \
 	{ \
-		return bsdf_##svmlower##_eval_reflect(&sc, omega_out, omega_in, &pdf); \
+		pdf = 0; \
+		return make_float3(0, 0, 0); \
 	} \
 \
 	float3 eval_transmit(const float3 &omega_out, const float3 &omega_in, float& pdf) const \
 	{ \
-		return bsdf_##svmlower##_eval_transmit(&sc, omega_out, omega_in, &pdf); \
+		pdf = 0; \
+		return make_float3(0, 0, 0); \
 	} \
 \
 	int sample(const float3 &Ng, \
@@ -168,8 +163,8 @@ public: \
 	           float3 &omega_in, float3 &domega_in_dx, float3 &domega_in_dy, \
 	           float &pdf, float3 &eval) const \
 	{ \
-		return bsdf_##svmlower##_sample(&sc, Ng, omega_out, domega_out_dx, domega_out_dy, \
-			randu, randv, &eval, &omega_in, &domega_in_dx, &domega_in_dy, &pdf); \
+		pdf = 0; \
+		return LABEL_NONE; \
 	} \
 }; \
 \
diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h
index 5a658d8244a..9c3134e41c9 100644
--- a/intern/cycles/kernel/osl/osl_globals.h
+++ b/intern/cycles/kernel/osl/osl_globals.h
@@ -20,7 +20,6 @@
 #ifdef WITH_OSL
 
 #include <OSL/oslexec.h>
-#include <cmath>
 
 #include "util_map.h"
 #include "util_param.h"
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 54894ea19eb..a9694651e14 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -126,7 +126,7 @@ void OSLRenderServices::thread_init(KernelGlobals *kernel_globals_, OSL::Texture
 	osl_ts = osl_ts_;
 }
 
-bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time)
+bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time)
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
@@ -156,7 +156,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr
 	return false;
 }
 
-bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time)
+bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time)
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
@@ -186,7 +186,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::Transform
 	return false;
 }
 
-bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from, float time)
+bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from, float time)
 {
 	KernelGlobals *kg = kernel_globals;
 
@@ -218,7 +218,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from, float ti
 	return false;
 }
 
-bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to, float time)
+bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring to, float time)
 {
 	KernelGlobals *kg = kernel_globals;
 
@@ -250,7 +250,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to, fl
 	return false;
 }
 
-bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform)
+bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform)
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
@@ -275,7 +275,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr
 	return false;
 }
 
-bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform)
+bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform)
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
@@ -300,7 +300,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::Transform
 	return false;
 }
 
-bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from)
+bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from)
 {
 	KernelGlobals *kg = kernel_globals;
 
@@ -328,7 +328,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from)
 	return false;
 }
 
-bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to)
+bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring to)
 {
 	KernelGlobals *kg = kernel_globals;
 	
@@ -356,7 +356,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to)
 	return false;
 }
 
-bool OSLRenderServices::get_array_attribute(void *renderstate, bool derivatives, 
+bool OSLRenderServices::get_array_attribute(OSL::ShaderGlobals *sg, bool derivatives, 
                                             ustring object, TypeDesc type, ustring name,
                                             int index, void *val)
 {
@@ -479,7 +479,7 @@ static bool set_attribute_int(int i, TypeDesc type, bool derivatives, void *val)
 
 static bool set_attribute_string(ustring str, TypeDesc type, bool derivatives, void *val)
 {
-	if(type.basetype == TypeDesc::INT && type.aggregate == TypeDesc::SCALAR && type.arraylen == 0) {
+	if(type.basetype == TypeDesc::STRING && type.aggregate == TypeDesc::SCALAR && type.arraylen == 0) {
 		ustring *sval = (ustring *)val;
 		sval[0] = str;
 
@@ -718,7 +718,7 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData *
 		return set_attribute_int(f, type, derivatives, val);
 	}
 	else if (name == u_path_transparent_depth) {
-		/* Ray Depth */
+		/* Transparent Ray Depth */
 		int f = sd->transparent_depth;
 		return set_attribute_int(f, type, derivatives, val);
 	}
@@ -751,14 +751,22 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData *
 		return false;
 }
 
-bool OSLRenderServices::get_attribute(void *renderstate, bool derivatives, ustring object_name,
+bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object_name,
+                                      TypeDesc type, ustring name, void *val)
+{
+	/* sg->renderstate is read as the ShaderData to query; with none set there is nothing to look up */
+	ShaderData *shader_data = (ShaderData *)(sg->renderstate);
+	if (shader_data == NULL)
+		return false;
+	return get_attribute(shader_data, derivatives, object_name, type, name, val);
+}
+
+bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring object_name,
                                       TypeDesc type, ustring name, void *val)
 {
-	ShaderData *sd = (ShaderData *)renderstate;
 	KernelGlobals *kg = sd->osl_globals;
 	bool is_curve;
 	int object;
-	// int prim;
 
 	/* lookup of attribute on another object */
 	if (object_name != u_empty) {
@@ -768,12 +776,10 @@ bool OSLRenderServices::get_attribute(void *renderstate, bool derivatives, ustri
 			return false;
 
 		object = it->second;
-		// prim = PRIM_NONE;
 		is_curve = false;
 	}
 	else {
 		object = sd->object;
-		// prim = sd->prim;
 		is_curve = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
 
 		if (object == OBJECT_NONE)
@@ -815,12 +821,12 @@ bool OSLRenderServices::get_attribute(void *renderstate, bool derivatives, ustri
 }
 
 bool OSLRenderServices::get_userdata(bool derivatives, ustring name, TypeDesc type, 
-                                     void *renderstate, void *val)
+                                     OSL::ShaderGlobals *sg, void *val)
 {
 	return false; /* disabled by lockgeom */
 }
 
-bool OSLRenderServices::has_userdata(ustring name, TypeDesc type, void *renderstate)
+bool OSLRenderServices::has_userdata(ustring name, TypeDesc type, OSL::ShaderGlobals *sg)
 {
 	return false; /* never called by OSL */
 }
@@ -871,14 +877,33 @@ bool OSLRenderServices::texture(ustring filename, TextureOpt &options,
 		return true;
 	}
 #endif
+	bool status;
 
-	OSLThreadData *tdata = kg->osl_tdata;
-	OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info;
+	/* Names of the form "@<number>" without a '.' are treated as an image slot index
+	 * and sampled through kernel_tex_image_interp() instead of OIIO. The length check
+	 * keeps an empty filename from being indexed. */
+	if(filename.length() && filename[0] == '@' && filename.find('.') == std::string::npos) {
+		int slot = atoi(filename.c_str() + 1);
+		float4 rgba = kernel_tex_image_interp(slot, s, 1.0f - t);
 
-	OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info);
+		result[0] = rgba[0];
+		if(options.nchannels > 1)
+			result[1] = rgba[1];
+		if(options.nchannels > 2)
+			result[2] = rgba[2];
+		if(options.nchannels > 3)
+			result[3] = rgba[3];
+		status = true;
+	}
+	else {
+		OSLThreadData *tdata = kg->osl_tdata;
+		OIIO::TextureSystem::Perthread *thread_info = tdata->oiio_thread_info;
 
-	bool status = ts->texture(th, thread_info,
-	                          options, s, t, dsdx, dtdx, dsdy, dtdy, result);
+		OIIO::TextureSystem::TextureHandle *th = ts->get_texture_handle(filename, thread_info);
+
+		status = ts->texture(th, thread_info,
+		                     options, s, t, dsdx, dtdx, dsdy, dtdy, result);
+	}
 
 	if(!status) {
 		if(options.nchannels == 3 || options.nchannels == 4) {
@@ -953,7 +975,7 @@ bool OSLRenderServices::environment(ustring filename, TextureOpt &options,
 	return status;
 }
 
-bool OSLRenderServices::get_texture_info(ustring filename, int subimage,
+bool OSLRenderServices::get_texture_info(OSL::ShaderGlobals *sg, ustring filename, int subimage,
                                          ustring dataname,
                                          TypeDesc datatype, void *data)
 {
@@ -996,7 +1018,7 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg,
 
 	ray.P = TO_FLOAT3(P);
 	ray.D = TO_FLOAT3(R);
-	ray.t = (options.maxdist == 1.0e30)? FLT_MAX: options.maxdist - options.mindist;
+	ray.t = (options.maxdist == 1.0e30f)? FLT_MAX: options.maxdist - options.mindist;
 	ray.time = sd->time;
 
 	if(options.mindist == 0.0f) {
@@ -1025,11 +1047,7 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg,
 	tracedata->sd.osl_globals = sd->osl_globals;
 
 	/* raytrace */
-#ifdef __HAIR__
 	return scene_intersect(sd->osl_globals, &ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f);
-#else
-	return scene_intersect(sd->osl_globals, &ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect);
-#endif
 }
 
 
diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h
index 069722d81b6..6f928a0d103 100644
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -49,27 +49,29 @@ public:
 	
 	void thread_init(KernelGlobals *kernel_globals, OSL::TextureSystem *ts);
 
-	bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time);
-	bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time);
+	bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time);
+	bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time);
 	
-	bool get_matrix(OSL::Matrix44 &result, ustring from, float time);
-	bool get_inverse_matrix(OSL::Matrix44 &result, ustring to, float time);
+	bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from, float time);
+	bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring to, float time);
 	
-	bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform);
-	bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform);
+	bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform);
+	bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform);
 	
-	bool get_matrix(OSL::Matrix44 &result, ustring from);
-	bool get_inverse_matrix(OSL::Matrix44 &result, ustring from);
+	bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from);
+	bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring to);
 
-	bool get_array_attribute(void *renderstate, bool derivatives,
+	bool get_array_attribute(OSL::ShaderGlobals *sg, bool derivatives,
 	                         ustring object, TypeDesc type, ustring name,
 	                         int index, void *val);
-	bool get_attribute(void *renderstate, bool derivatives, ustring object,
+	bool get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object,
+	                   TypeDesc type, ustring name, void *val);
+	bool get_attribute(ShaderData *sd, bool derivatives, ustring object_name,
 	                   TypeDesc type, ustring name, void *val);
 
 	bool get_userdata(bool derivatives, ustring name, TypeDesc type,
-	                  void *renderstate, void *val);
-	bool has_userdata(ustring name, TypeDesc type, void *renderstate);
+	                  OSL::ShaderGlobals *sg, void *val);
+	bool has_userdata(ustring name, TypeDesc type, OSL::ShaderGlobals *sg);
 
 	int pointcloud_search(OSL::ShaderGlobals *sg, ustring filename, const OSL::Vec3 &center,
 	                      float radius, int max_points, bool sort, size_t *out_indices,
@@ -106,7 +108,7 @@ public:
 	                 OSL::ShaderGlobals *sg, const OSL::Vec3 &R,
 	                 const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy, float *result);
 
-	bool get_texture_info(ustring filename, int subimage,
+	bool get_texture_info(OSL::ShaderGlobals *sg, ustring filename, int subimage,
 	                      ustring dataname, TypeDesc datatype, void *data);
 
 	static bool get_background_attribute(KernelGlobals *kg, ShaderData *sd, ustring name,
@@ -157,6 +159,79 @@ public:
 	static ustring u_v;
 	static ustring u_empty;
 
+#if OSL_LIBRARY_VERSION_CODE < 10500
+	/* Overloads with the older callback signatures (no ShaderGlobals argument, or a
+	 * raw renderstate pointer), compiled only when OSL_LIBRARY_VERSION_CODE < 10500.
+	 * Each forwards to the ShaderGlobals-based method, passing NULL or a temporary
+	 * ShaderGlobals that only has renderstate filled in. */
+	bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) {
+		return get_matrix(NULL, result, xform, time);
+	}
+
+	bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) {
+		return get_inverse_matrix(NULL, result, xform, time);
+	}
+
+	bool get_matrix(OSL::Matrix44 &result, ustring from, float time) {
+		return get_matrix(NULL, result, from, time);
+	}
+
+	bool get_inverse_matrix(OSL::Matrix44 &result, ustring to, float time) {
+		return get_inverse_matrix(NULL, result, to, time);
+	}
+
+	bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) {
+		return get_matrix(NULL, result, xform);
+	}
+
+	bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) {
+		return get_inverse_matrix(NULL, result, xform);
+	}
+
+	bool get_matrix(OSL::Matrix44 &result, ustring from) {
+		return get_matrix(NULL, result, from);
+	}
+
+	bool get_inverse_matrix(OSL::Matrix44 &result, ustring to) {
+		return get_inverse_matrix(NULL, result, to);
+	}
+
+	bool get_array_attribute(void *renderstate, bool derivatives,
+	                         ustring object, TypeDesc type, ustring name,
+	                         int index, void *val) {
+		OSL::ShaderGlobals sg;
+		sg.renderstate = renderstate;
+		return get_array_attribute(&sg, derivatives,
+		                           object, type, name,
+		                           index, val);
+	}
+
+	bool get_attribute(void *renderstate, bool derivatives, ustring object_name,
+	                   TypeDesc type, ustring name, void *val) {
+		OSL::ShaderGlobals sg;
+		sg.renderstate = renderstate;
+		return get_attribute(&sg, derivatives, object_name, type, name, val);
+	}
+
+	bool has_userdata(ustring name, TypeDesc type, void *renderstate) {
+		/* renderstate is not a ShaderGlobals, so wrap it the same way get_attribute() does */
+		OSL::ShaderGlobals sg;
+		sg.renderstate = renderstate;
+		return has_userdata(name, type, &sg);
+	}
+
+	bool get_userdata(bool derivatives, ustring name, TypeDesc type,
+	                  void *renderstate, void *val) {
+		OSL::ShaderGlobals sg;
+		sg.renderstate = renderstate;
+		return get_userdata(derivatives, name, type, &sg, val);
+	}
+
+	bool get_texture_info(ustring filename, int subimage,
+	                      ustring dataname, TypeDesc datatype, void *data) {
+		return get_texture_info(NULL, filename, subimage, dataname, datatype, data);
+	}
+#endif
 private:
 	KernelGlobals *kernel_globals;
 	OSL::TextureSystem *osl_ts;
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 843dcdd0985..ca0c2cc4415 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -14,6 +14,8 @@
  * limitations under the License
  */
 
+#include <OSL/oslexec.h>
+
 #include "kernel_compat_cpu.h"
 #include "kernel_montecarlo.h"
 #include "kernel_types.h"
@@ -34,7 +36,6 @@
 
 #include "attribute.h"
 
-#include <OSL/oslexec.h>
 
 CCL_NAMESPACE_BEGIN
 
@@ -164,11 +165,14 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
 					CBSDFClosure *bsdf = (CBSDFClosure *)prim;
 					int scattering = bsdf->scattering();
 
-					/* no caustics option */
-					if(scattering == LABEL_GLOSSY && (path_flag & PATH_RAY_DIFFUSE)) {
+					/* caustic options */
+					if((scattering & LABEL_GLOSSY) && (path_flag & PATH_RAY_DIFFUSE)) {
 						KernelGlobals *kg = sd->osl_globals;
-						if(kernel_data.integrator.no_caustics)
+
+						if((!kernel_data.integrator.caustics_reflective && (scattering & LABEL_REFLECT)) ||
+						   (!kernel_data.integrator.caustics_refractive && (scattering & LABEL_TRANSMIT))) {
 							return;
+						}
 					}
 
 					/* sample weight */
@@ -181,12 +185,9 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
 					sc.T = bsdf->sc.T;
 					sc.data0 = bsdf->sc.data0;
 					sc.data1 = bsdf->sc.data1;
+					sc.data2 = bsdf->sc.data2;
 					sc.prim = bsdf->sc.prim;
 
-#ifdef __HAIR__
-					sc.offset = bsdf->sc.offset;
-#endif
-
 					/* add */
 					if(sc.sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) {
 						sd->closure[sd->num_closure++] = sc;
@@ -202,6 +203,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
 					sc.type = CLOSURE_EMISSION_ID;
 					sc.data0 = 0.0f;
 					sc.data1 = 0.0f;
+					sc.data2 = 0.0f;
 					sc.prim = NULL;
 
 					/* flag */
@@ -219,6 +221,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
 					sc.type = CLOSURE_AMBIENT_OCCLUSION_ID;
 					sc.data0 = 0.0f;
 					sc.data1 = 0.0f;
+					sc.data2 = 0.0f;
 					sc.prim = NULL;
 
 					if(sd->num_closure < MAX_CLOSURE) {
@@ -232,6 +235,7 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
 					sc.type = CLOSURE_HOLDOUT_ID;
 					sc.data0 = 0.0f;
 					sc.data1 = 0.0f;
+					sc.data2 = 0.0f;
 					sc.prim = NULL;
 
 					if(sd->num_closure < MAX_CLOSURE) {
diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt
index 5518d652bf9..0b735ede701 100644
--- a/intern/cycles/kernel/shaders/CMakeLists.txt
+++ b/intern/cycles/kernel/shaders/CMakeLists.txt
@@ -4,6 +4,7 @@
 set(SRC_OSL
 	node_add_closure.osl
 	node_ambient_occlusion.osl
+	node_anisotropic_bsdf.osl
 	node_attribute.osl
 	node_background.osl
 	node_brick_texture.osl
@@ -13,6 +14,7 @@ set(SRC_OSL
 	node_checker_texture.osl
 	node_combine_rgb.osl
 	node_combine_hsv.osl
+	node_combine_xyz.osl
 	node_convert_from_color.osl
 	node_convert_from_float.osl
 	node_convert_from_int.osl
@@ -57,6 +59,7 @@ set(SRC_OSL
 	node_rgb_ramp.osl
 	node_separate_rgb.osl
 	node_separate_hsv.osl
+	node_separate_xyz.osl
 	node_set_normal.osl
 	node_sky_texture.osl
 	node_subsurface_scattering.osl
@@ -71,7 +74,6 @@ set(SRC_OSL
 	node_vector_transform.osl
 	node_velvet_bsdf.osl
 	node_voronoi_texture.osl
-	node_ward_bsdf.osl
 	node_wavelength.osl
 	node_blackbody.osl
 	node_wave_texture.osl
diff --git a/intern/cycles/kernel/shaders/node_ward_bsdf.osl b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl
index 2d360d594f2..da1e4f77107 100644
--- a/intern/cycles/kernel/shaders/node_ward_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_anisotropic_bsdf.osl
@@ -16,8 +16,9 @@
 
 #include "stdosl.h"
 
-shader node_ward_bsdf(
+shader node_anisotropic_bsdf(
 	color Color = 0.0,
+	string distribution = "GGX",
 	float Roughness = 0.0,
 	float Anisotropy = 0.0,
 	float Rotation = 0.0,
@@ -44,6 +45,13 @@ shader node_ward_bsdf(
 		RoughnessV = Roughness / (1.0 - aniso);
 	}
 
-	BSDF = Color * ward(Normal, T, RoughnessU, RoughnessV);
+	if (distribution == "Sharp")
+		BSDF = Color * reflection(Normal);
+	else if (distribution == "Beckmann")
+		BSDF = Color * microfacet_beckmann_aniso(Normal, T, RoughnessU, RoughnessV);
+	else if (distribution == "GGX")
+		BSDF = Color * microfacet_ggx_aniso(Normal, T, RoughnessU, RoughnessV);
+	else
+		BSDF = Color * ashikhmin_shirley(Normal, T, RoughnessU, RoughnessV);
 }
 
diff --git a/intern/cycles/kernel/shaders/node_brick_texture.osl b/intern/cycles/kernel/shaders/node_brick_texture.osl
index 70a6a6ea7ce..c9fb3542aef 100644
--- a/intern/cycles/kernel/shaders/node_brick_texture.osl
+++ b/intern/cycles/kernel/shaders/node_brick_texture.osl
@@ -93,6 +93,6 @@ shader node_brick_texture(
 		Col[2] = facm * (Color1[2]) + tint * Color2[2];
 	}
 	
-	Color = (Fac == 1.0) ? Mortar: Col;
+	Color = (Fac == 1.0) ? Mortar : Col;
 }
 
diff --git a/intern/cycles/kernel/shaders/node_checker_texture.osl b/intern/cycles/kernel/shaders/node_checker_texture.osl
index 6723076723c..a6d21fd36f3 100644
--- a/intern/cycles/kernel/shaders/node_checker_texture.osl
+++ b/intern/cycles/kernel/shaders/node_checker_texture.osl
@@ -21,9 +21,9 @@
 
 float checker(point p)
 {
-	p[0] = (p[0] + 0.00001) * 0.9999;
-	p[1] = (p[1] + 0.00001) * 0.9999;
-	p[2] = (p[2] + 0.00001) * 0.9999;
+	p[0] = (p[0] + 0.000001) * 0.999999;
+	p[1] = (p[1] + 0.000001) * 0.999999;
+	p[2] = (p[2] + 0.000001) * 0.999999;
 	
 	int xi = (int)fabs(floor(p[0]));
 	int yi = (int)fabs(floor(p[1]));
diff --git a/intern/cycles/kernel/shaders/node_combine_xyz.osl b/intern/cycles/kernel/shaders/node_combine_xyz.osl
new file mode 100644
index 00000000000..933dee5bd78
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_combine_xyz.osl
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+#include "stdosl.h"
+
+shader node_combine_xyz(
+	float X = 0.0,  /* first component of the result */
+	float Y = 0.0,  /* second component of the result */
+	float Z = 0.0,  /* third component of the result */
+	output vector Vector = 0.8)  /* default is always overwritten below */
+{
+	Vector = vector(X, Y, Z);  /* pack the three floats into one vector */
+}
+
diff --git a/intern/cycles/kernel/shaders/node_emission.osl b/intern/cycles/kernel/shaders/node_emission.osl
index 2428da5ef4e..b28d731c19f 100644
--- a/intern/cycles/kernel/shaders/node_emission.osl
+++ b/intern/cycles/kernel/shaders/node_emission.osl
@@ -17,14 +17,10 @@
 #include "stdosl.h"
 
 shader node_emission(
-	int TotalPower = 0,
 	color Color = 0.8,
 	float Strength = 1.0,
 	output closure color Emission = 0)
 {
-	if (TotalPower)
-		Emission = ((Strength / surfacearea()) * Color) * emission();
-	else
-		Emission = (Strength * Color) * emission();
+	Emission = (Strength * Color) * emission();
 }
 
diff --git a/intern/cycles/kernel/shaders/node_fresnel.h b/intern/cycles/kernel/shaders/node_fresnel.h
index 447a84255ef..d192c5d02de 100644
--- a/intern/cycles/kernel/shaders/node_fresnel.h
+++ b/intern/cycles/kernel/shaders/node_fresnel.h
@@ -34,3 +34,16 @@ float fresnel_dielectric_cos(float cosi, float eta)
 	return result;
 }
 
+color fresnel_conductor(float cosi, color eta, color k)
+{
+	/* per-channel reflectance; eta and k are presumably the real and imaginary IOR parts (confirm with callers) */
+	color cos_sq = color(cosi * cosi);
+	color two_eta_cos = 2.0 * eta * cosi;
+	color eta_k_sq = eta * eta + k * k;
+	color scaled = eta_k_sq * cos_sq;
+	color r_parallel = (scaled - two_eta_cos + color(1, 1, 1)) / (scaled + two_eta_cos + color(1, 1, 1));
+	color r_perpendicular = (eta_k_sq - two_eta_cos + cos_sq) / (eta_k_sq + two_eta_cos + cos_sq);
+	/* the result is the mean of the two terms */
+	return (r_parallel + r_perpendicular) * 0.5;
+}
+
diff --git a/intern/cycles/kernel/shaders/node_geometry.osl b/intern/cycles/kernel/shaders/node_geometry.osl
index dbdf55802ae..cd68f07b21e 100644
--- a/intern/cycles/kernel/shaders/node_geometry.osl
+++ b/intern/cycles/kernel/shaders/node_geometry.osl
@@ -49,12 +49,8 @@ shader node_geometry(
 
 	/* try to create spherical tangent from generated coordinates */
 	if (getattribute("geom:generated", generated)) {
-		matrix project = matrix(0.0, 1.0, 0.0, 0.0,
-		                        -1.0, 0.0, 0.0, 0.0,
-		                        0.0, 0.0, 0.0, 0.0,
-		                        0.5, -0.5, 0.0, 1.0);
-
-		vector T = transform("object", "world", transform(project, generated));
+		normal data = normal(-(generated[1] - 0.5), (generated[0] - 0.5), 0.0);
+		vector T = transform("object", "world", data);
 		Tangent = cross(Normal, normalize(cross(T, Normal)));
 	}
 	else {
diff --git a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
index b4e0fe62223..5c727ca6917 100644
--- a/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_glossy_bsdf.osl
@@ -19,7 +19,7 @@
 
 shader node_glossy_bsdf(
 	color Color = 0.8,
-	string distribution = "Beckmann",
+	string distribution = "GGX",
 	float Roughness = 0.2,
 	normal Normal = N,
 	output closure color BSDF = 0)
@@ -30,6 +30,8 @@ shader node_glossy_bsdf(
 		BSDF = Color * microfacet_beckmann(Normal, Roughness);
 	else if (distribution == "GGX")
 		BSDF = Color * microfacet_ggx(Normal, Roughness);
+	else
+		BSDF = Color * ashikhmin_shirley(Normal, vector(0, 0, 0), Roughness, Roughness);
 
 }
 
diff --git a/intern/cycles/kernel/shaders/node_image_texture.osl b/intern/cycles/kernel/shaders/node_image_texture.osl
index 7238a1e8862..18b5fb4b31f 100644
--- a/intern/cycles/kernel/shaders/node_image_texture.osl
+++ b/intern/cycles/kernel/shaders/node_image_texture.osl
@@ -113,6 +113,10 @@ shader node_image_texture(
 				weight[2] = ((2.0 - limit) * Nob[2] + (limit - 1.0)) / (2.0 * limit - 1.0);
 			}
 		}
+		else {
+			/* Desperate mode: no valid choice anyway, fall back to one side. */
+			weight[0] = 1.0;
+		}
 
 		Color = color(0.0, 0.0, 0.0);
 		Alpha = 0.0;
diff --git a/intern/cycles/kernel/shaders/node_musgrave_texture.osl b/intern/cycles/kernel/shaders/node_musgrave_texture.osl
index 60762539002..a32c3d4b1b8 100644
--- a/intern/cycles/kernel/shaders/node_musgrave_texture.osl
+++ b/intern/cycles/kernel/shaders/node_musgrave_texture.osl
@@ -35,14 +35,14 @@ float noise_musgrave_fBm(point p, string basis, float H, float lacunarity, float
 	int i;
 
 	for (i = 0; i < (int)octaves; i++) {
-		value += safe_noise(p, 0) * pwr;
+		value += safe_noise(p, "signed") * pwr;
 		pwr *= pwHL;
 		p *= lacunarity;
 	}
 
 	rmd = octaves - floor(octaves);
 	if (rmd != 0.0)
-		value += rmd * safe_noise(p, 0) * pwr;
+		value += rmd * safe_noise(p, "signed") * pwr;
 
 	return value;
 }
@@ -63,14 +63,14 @@ float noise_musgrave_multi_fractal(point p, string basis, float H, float lacunar
 	int i;
 
 	for (i = 0; i < (int)octaves; i++) {
-		value *= (pwr * safe_noise(p, 0) + 1.0);
+		value *= (pwr * safe_noise(p, "signed") + 1.0);
 		pwr *= pwHL;
 		p *= lacunarity;
 	}
 
 	rmd = octaves - floor(octaves);
 	if (rmd != 0.0)
-		value *= (rmd * pwr * safe_noise(p, 0) + 1.0); /* correct? */
+		value *= (rmd * pwr * safe_noise(p, "signed") + 1.0); /* correct? */
 
 	return value;
 }
@@ -91,11 +91,11 @@ float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacuna
 	int i;
 
 	/* first unscaled octave of function; later octaves are scaled */
-	value = offset + safe_noise(p, 0);
+	value = offset + safe_noise(p, "signed");
 	p *= lacunarity;
 
 	for (i = 1; i < (int)octaves; i++) {
-		increment = (safe_noise(p, 0) + offset) * pwr * value;
+		increment = (safe_noise(p, "signed") + offset) * pwr * value;
 		value += increment;
 		pwr *= pwHL;
 		p *= lacunarity;
@@ -103,7 +103,7 @@ float noise_musgrave_hetero_terrain(point p, string basis, float H, float lacuna
 
 	rmd = octaves - floor(octaves);
 	if (rmd != 0.0) {
-		increment = (safe_noise(p, 0) + offset) * pwr * value;
+		increment = (safe_noise(p, "signed") + offset) * pwr * value;
 		value += rmd * increment;
 	}
 
@@ -126,7 +126,7 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H,
 	float pwr = pwHL;
 	int i;
 
-	result = safe_noise(p, 0) + offset;
+	result = safe_noise(p, "signed") + offset;
 	weight = gain * result;
 	p *= lacunarity;
 
@@ -134,7 +134,7 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H,
 		if (weight > 1.0)
 			weight = 1.0;
 
-		signal = (safe_noise(p, 0) + offset) * pwr;
+		signal = (safe_noise(p, "signed") + offset) * pwr;
 		pwr *= pwHL;
 		result += weight * signal;
 		weight *= gain * signal;
@@ -143,7 +143,7 @@ float noise_musgrave_hybrid_multi_fractal(point p, string basis, float H,
 
 	rmd = octaves - floor(octaves);
 	if (rmd != 0.0)
-		result += rmd * ((safe_noise(p, 0) + offset) * pwr);
+		result += rmd * ((safe_noise(p, "signed") + offset) * pwr);
 
 	return result;
 }
@@ -164,7 +164,7 @@ float noise_musgrave_ridged_multi_fractal(point p, string basis, float H,
 	float pwr = pwHL;
 	int i;
 
-	signal = offset - fabs(safe_noise(p, 0));
+	signal = offset - fabs(safe_noise(p, "signed"));
 	signal *= signal;
 	result = signal;
 	weight = 1.0;
@@ -172,7 +172,7 @@ float noise_musgrave_ridged_multi_fractal(point p, string basis, float H,
 	for (i = 1; i < (int)octaves; i++) {
 		p *= lacunarity;
 		weight = clamp(signal * gain, 0.0, 1.0);
-		signal = offset - fabs(safe_noise(p, 0));
+		signal = offset - fabs(safe_noise(p, "signed"));
 		signal *= signal;
 		signal *= weight;
 		result += signal * pwr;
diff --git a/intern/cycles/kernel/shaders/node_separate_xyz.osl b/intern/cycles/kernel/shaders/node_separate_xyz.osl
new file mode 100644
index 00000000000..63725cb9995
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_separate_xyz.osl
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+#include "stdosl.h"
+
+shader node_separate_xyz(
+	vector Vector = 0.8,  /* vector to split */
+	output float X = 0.0,  /* receives Vector[0] */
+	output float Y = 0.0,  /* receives Vector[1] */
+	output float Z = 0.0)  /* receives Vector[2] */
+{
+	X = Vector[0];
+	Y = Vector[1];
+	Z = Vector[2];
+}
diff --git a/intern/cycles/kernel/shaders/node_texture.h b/intern/cycles/kernel/shaders/node_texture.h
index de51559f297..2710eed414a 100644
--- a/intern/cycles/kernel/shaders/node_texture.h
+++ b/intern/cycles/kernel/shaders/node_texture.h
@@ -153,12 +153,12 @@ float voronoi_CrS(point p) { return 2.0 * voronoi_Cr(p) - 1.0; }
 
 /* Noise Bases */
 
-float safe_noise(point p, int type)
+float safe_noise(point p, string type)
 {
 	float f = 0.0;
 	
 	/* Perlin noise in range -1..1 */
-	if (type == 0)
+	if (type == "signed")
 		f = noise("perlin", p);
 	
 	/* Perlin noise in range 0..1 */
@@ -175,7 +175,7 @@ float safe_noise(point p, int type)
 float noise_basis(point p, string basis)
 {
 	if (basis == "Perlin")
-		return safe_noise(p, 1);
+		return safe_noise(p, "unsigned");
 	if (basis == "Voronoi F1")
 		return voronoi_F1S(p);
 	if (basis == "Voronoi F2")
diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h
index 6f824ea8ebd..1ff8f363b49 100644
--- a/intern/cycles/kernel/shaders/stdosl.h
+++ b/intern/cycles/kernel/shaders/stdosl.h
@@ -476,17 +476,17 @@ closure color diffuse_ramp(normal N, color colors[8]) BUILTIN;
 closure color phong_ramp(normal N, float exponent, color colors[8]) BUILTIN;
 closure color diffuse_toon(normal N, float size, float smooth) BUILTIN;
 closure color glossy_toon(normal N, float size, float smooth) BUILTIN;
-closure color westin_backscatter(normal N, float roughness) BUILTIN;
-closure color westin_sheen(normal N, float edginess) BUILTIN;
 closure color translucent(normal N) BUILTIN;
 closure color reflection(normal N) BUILTIN;
 closure color refraction(normal N, float eta) BUILTIN;
 closure color transparent() BUILTIN;
 closure color microfacet_ggx(normal N, float ag) BUILTIN;
+closure color microfacet_ggx_aniso(normal N, vector T, float ax, float ay) BUILTIN;
 closure color microfacet_ggx_refraction(normal N, float ag, float eta) BUILTIN;
 closure color microfacet_beckmann(normal N, float ab) BUILTIN;
+closure color microfacet_beckmann_aniso(normal N, vector T, float ax, float ay) BUILTIN;
 closure color microfacet_beckmann_refraction(normal N, float ab, float eta) BUILTIN;
-closure color ward(normal N, vector T,float ax, float ay) BUILTIN;
+closure color ashikhmin_shirley(normal N, vector T,float ax, float ay) BUILTIN;
 closure color ashikhmin_velvet(normal N, float sigma) BUILTIN;
 closure color emission() BUILTIN;
 closure color background() BUILTIN;
@@ -505,12 +505,8 @@ closure color hair_transmission(normal N, float roughnessu, float roughnessv, ve
 closure color henyey_greenstein(float g) BUILTIN;
 closure color absorption() BUILTIN;
 
-// Backwards compatibility
-closure color bssrdf_cubic(normal N, vector radius) BUILTIN;
-closure color bssrdf_gaussian(normal N, vector radius) BUILTIN;
-closure color specular_toon(normal N, float size, float smooth) BUILTIN;
-
 // Renderer state
+int backfacing () BUILTIN;
 int raytype (string typename) BUILTIN;
 // the individual 'isFOOray' functions are deprecated
 int iscameraray () { return raytype("camera"); }
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index dbf59c60cb0..c13eae813d6 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -167,8 +167,8 @@ CCL_NAMESPACE_END
 #include "svm_math.h"
 #include "svm_mix.h"
 #include "svm_ramp.h"
-#include "svm_sepcomb_rgb.h"
 #include "svm_sepcomb_hsv.h"
+#include "svm_sepcomb_vector.h"
 #include "svm_musgrave.h"
 #include "svm_sky.h"
 #include "svm_tex_coord.h"
@@ -236,7 +236,7 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 				if(stack_load_float(stack, node.z) == 1.0f)
 					offset += node.y;
 				break;
-#ifdef __IMAGE_TEXTURES__
+#ifdef __TEXTURES__
 			case NODE_TEX_IMAGE:
 				svm_node_tex_image(kg, sd, stack, node);
 				break;
@@ -246,8 +246,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_TEX_ENVIRONMENT:
 				svm_node_tex_environment(kg, sd, stack, node);
 				break;
-#endif
-#ifdef __PROCEDURAL_TEXTURES__
 			case NODE_TEX_SKY:
 				svm_node_tex_sky(kg, sd, stack, node, &offset);
 				break;
@@ -327,11 +325,11 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_MIX:
 				svm_node_mix(kg, sd, stack, node.y, node.z, node.w, &offset);
 				break;
-			case NODE_SEPARATE_RGB:
-				svm_node_separate_rgb(sd, stack, node.y, node.z, node.w);
+			case NODE_SEPARATE_VECTOR:
+				svm_node_separate_vector(sd, stack, node.y, node.z, node.w);
 				break;
-			case NODE_COMBINE_RGB:
-				svm_node_combine_rgb(sd, stack, node.y, node.z, node.w);
+			case NODE_COMBINE_VECTOR:
+				svm_node_combine_vector(sd, stack, node.y, node.z, node.w);
 				break;
 			case NODE_SEPARATE_HSV:
 				svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
@@ -407,12 +405,7 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 				break;
 			case NODE_CLOSURE_SET_NORMAL:
 				svm_node_set_normal(kg, sd, stack, node.y, node.z );
-				break;			
-#endif
-			case NODE_EMISSION_SET_WEIGHT_TOTAL:
-				svm_node_emission_set_weight_total(kg, sd, node.y, node.z, node.w);
 				break;
-#ifdef __EXTRA_NODES__
 			case NODE_RGB_RAMP:
 				svm_node_rgb_ramp(kg, sd, stack, node, &offset);
 				break;
@@ -425,17 +418,13 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_LIGHT_FALLOFF:
 				svm_node_light_falloff(sd, stack, node);
 				break;
-#endif			
-#ifdef __ANISOTROPIC__
+#endif
 			case NODE_TANGENT:
 				svm_node_tangent(kg, sd, stack, node);
 				break;
-#endif			
-#ifdef __NORMAL_MAP__
 			case NODE_NORMAL_MAP:
 				svm_node_normal_map(kg, sd, stack, node);
-				break;
-#endif			
+				break;	
 			case NODE_END:
 			default:
 				return;
diff --git a/intern/cycles/kernel/svm/svm_blackbody.h b/intern/cycles/kernel/svm/svm_blackbody.h
index 63dbf27d35e..1e40e868e14 100644
--- a/intern/cycles/kernel/svm/svm_blackbody.h
+++ b/intern/cycles/kernel/svm/svm_blackbody.h
@@ -42,7 +42,7 @@ ccl_device void svm_node_blackbody(KernelGlobals *kg, ShaderData *sd, float *sta
 	/* Input */
 	float temperature = stack_load_float(stack, temperature_offset);
 
-	if (temperature < BB_DRAPPER) {
+	if (temperature < BB_DRAPER) {
 		/* just return very very dim red */
 		color_rgb = make_float3(1.0e-6f,0.0f,0.0f);
 	}
@@ -53,9 +53,9 @@ ccl_device void svm_node_blackbody(KernelGlobals *kg, ShaderData *sd, float *sta
 
 		/* reconstruct a proper index for the table lookup, compared to OSL we don't look up two colors
 		just one (the OSL-lerp is also automatically done for us by "lookup_table_read") */
-		float t = powf((temperature - BB_DRAPPER) * (1.0f / BB_TABLE_SPACING), (1.0f / BB_TABLE_XPOWER));
+		float t = powf((temperature - BB_DRAPER) * (1.0f / BB_TABLE_SPACING), (1.0f / BB_TABLE_XPOWER));
 
-		int blackbody_table_offset = kernel_data.blackbody.table_offset;
+		int blackbody_table_offset = kernel_data.tables.blackbody_offset;
 
 		/* Retrieve colors from the lookup table */
 		float lutval = t*lookuptablenormalize;
diff --git a/intern/cycles/kernel/svm/svm_checker.h b/intern/cycles/kernel/svm/svm_checker.h
index 8d1a1a40449..e0408ad334a 100644
--- a/intern/cycles/kernel/svm/svm_checker.h
+++ b/intern/cycles/kernel/svm/svm_checker.h
@@ -21,9 +21,9 @@ CCL_NAMESPACE_BEGIN
 ccl_device_noinline float svm_checker(float3 p)
 {
 	/* avoid precision issues on unit coordinates */
-	p.x = (p.x + 0.00001f)*0.9999f;
-	p.y = (p.y + 0.00001f)*0.9999f;
-	p.z = (p.z + 0.00001f)*0.9999f;
+	p.x = (p.x + 0.000001f)*0.999999f;
+	p.y = (p.y + 0.000001f)*0.999999f;
+	p.z = (p.z + 0.000001f)*0.999999f;
 
 	int xi = float_to_int(fabsf(floorf(p.x)));
 	int yi = float_to_int(fabsf(floorf(p.y)));
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index a3770877544..30110db3ef9 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -24,6 +24,7 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
 		if(refract) {
 			sc->data0 = eta;
 			sc->data1 = 0.0f;
+			sc->data2 = 0.0f;
 			sd->flag |= bsdf_refraction_setup(sc);
 		}
 		else
@@ -31,7 +32,8 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
 	}
 	else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) {
 		sc->data0 = roughness;
-		sc->data1 = eta;
+		sc->data1 = roughness;
+		sc->data2 = eta;
 
 		if(refract)
 			sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc);
@@ -40,7 +42,8 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
 	}
 	else {
 		sc->data0 = roughness;
-		sc->data1 = eta;
+		sc->data1 = roughness;
+		sc->data2 = eta;
 
 		if(refract)
 			sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc);
@@ -135,11 +138,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				if(roughness == 0.0f) {
 					sc->data0 = 0.0f;
 					sc->data1 = 0.0f;
+					sc->data2 = 0.0f;
 					sd->flag |= bsdf_diffuse_setup(sc);
 				}
 				else {
 					sc->data0 = roughness;
 					sc->data1 = 0.0f;
+					sc->data2 = 0.0f;
 					sd->flag |= bsdf_oren_nayar_setup(sc);
 				}
 			}
@@ -151,6 +156,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(sc) {
 				sc->data0 = 0.0f;
 				sc->data1 = 0.0f;
+				sc->data2 = 0.0f;
 				sc->N = N;
 				sd->flag |= bsdf_translucent_setup(sc);
 			}
@@ -162,6 +168,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(sc) {
 				sc->data0 = 0.0f;
 				sc->data1 = 0.0f;
+				sc->data2 = 0.0f;
 				sc->N = N;
 				sd->flag |= bsdf_transparent_setup(sc);
 			}
@@ -169,9 +176,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 		}
 		case CLOSURE_BSDF_REFLECTION_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_ID:
-		case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: {
+		case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
+		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: {
 #ifdef __CAUSTICS_TRICKS__
-			if(kernel_data.integrator.no_caustics && (path_flag & PATH_RAY_DIFFUSE))
+			if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
 			ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
@@ -179,15 +187,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(sc) {
 				sc->N = N;
 				sc->data0 = param1;
-				sc->data1 = 0.0f;
+				sc->data1 = param1;
+				sc->data2 = 0.0f;
 
 				/* setup bsdf */
 				if(type == CLOSURE_BSDF_REFLECTION_ID)
 					sd->flag |= bsdf_reflection_setup(sc);
 				else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID)
 					sd->flag |= bsdf_microfacet_beckmann_setup(sc);
-				else
+				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID)
 					sd->flag |= bsdf_microfacet_ggx_setup(sc);
+				else
+					sd->flag |= bsdf_ashikhmin_shirley_setup(sc);
 			}
 
 			break;
@@ -196,25 +207,35 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: {
 #ifdef __CAUSTICS_TRICKS__
-			if(kernel_data.integrator.no_caustics && (path_flag & PATH_RAY_DIFFUSE))
+			if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
 			ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
 
 			if(sc) {
 				sc->N = N;
-				sc->data0 = param1;
 
 				float eta = fmaxf(param2, 1e-5f);
-				sc->data1 = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
+				eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
 
 				/* setup bsdf */
-				if(type == CLOSURE_BSDF_REFRACTION_ID)
+				if(type == CLOSURE_BSDF_REFRACTION_ID) {
+					sc->data0 = eta;
+					sc->data1 = 0.0f;
+					sc->data2 = 0.0f;
+
 					sd->flag |= bsdf_refraction_setup(sc);
-				else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID)
-					sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc);
-				else
-					sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc);
+				}
+				else {
+					sc->data0 = param1;
+					sc->data1 = param1;
+					sc->data2 = eta;
+
+					if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID)
+						sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc);
+					else
+						sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc);
+				}
 			}
 
 			break;
@@ -223,8 +244,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 		case CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID: {
 #ifdef __CAUSTICS_TRICKS__
-			if(kernel_data.integrator.no_caustics && (path_flag & PATH_RAY_DIFFUSE))
+			if(!kernel_data.integrator.caustics_reflective &&
+			   !kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE)) {
 				break;
+			}
 #endif
 			/* index of refraction */
 			float eta = fmaxf(param2, 1e-5f);
@@ -241,12 +264,21 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			float sample_weight = sc->sample_weight;
 
 			sc = svm_node_closure_get_bsdf(sd, mix_weight*fresnel);
-
-			if(sc) {
-				sc->N = N;
-				svm_node_glass_setup(sd, sc, type, eta, roughness, false);
+#ifdef __CAUSTICS_TRICKS__
+			if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0)
+#endif
+			{
+				if(sc) {
+					sc->N = N;
+					svm_node_glass_setup(sd, sc, type, eta, roughness, false);
+				}
 			}
 
+#ifdef __CAUSTICS_TRICKS__
+			if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
+				break;
+#endif
+
 			/* refraction */
 			sc = &sd->closure[sd->num_closure];
 			sc->weight = weight;
@@ -261,9 +293,11 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 			break;
 		}
-		case CLOSURE_BSDF_WARD_ID: {
+		case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: {
 #ifdef __CAUSTICS_TRICKS__
-			if(kernel_data.integrator.no_caustics && (path_flag & PATH_RAY_DIFFUSE))
+			if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
 			ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
@@ -271,7 +305,6 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(sc) {
 				sc->N = N;
 
-#ifdef __ANISOTROPIC__
 				sc->T = stack_load_float3(stack, data_node.y);
 
 				/* rotate tangent */
@@ -293,10 +326,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->data1 = roughness/(1.0f - anisotropy);
 				}
 
-				sd->flag |= bsdf_ward_setup(sc);
-#else
-				sd->flag |= bsdf_diffuse_setup(sc);
-#endif
+				sc->data2 = 0.0f;
+
+				if (type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID)
+					sd->flag |= bsdf_microfacet_beckmann_aniso_setup(sc);
+				else if (type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID)
+					sd->flag |= bsdf_microfacet_ggx_aniso_setup(sc);
+				else
+					sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(sc);
 			}
 			break;
 		}
@@ -309,6 +346,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				/* sigma */
 				sc->data0 = clamp(param1, 0.0f, 1.0f);
 				sc->data1 = 0.0f;
+				sc->data2 = 0.0f;
 				sd->flag |= bsdf_ashikhmin_velvet_setup(sc);
 			}
 			break;
@@ -322,6 +360,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				sc->N = N;
 				sc->data0 = param1;
 				sc->data1 = param2;
+				sc->data2 = 0.0f;
 				
 				if (type == CLOSURE_BSDF_DIFFUSE_TOON_ID)
 					sd->flag |= bsdf_diffuse_toon_setup(sc);
@@ -339,7 +378,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 				if(sc) {
 					/* todo: giving a fixed weight here will cause issues when
-					 * mixing multiple BSDFS. energey will not be conserved and
+					 * mixing multiple BSDFS. energy will not be conserved and
 					 * the throughput can blow up after multiple bounces. we
 					 * better figure out a way to skip backfaces from rays
 					 * spawned by transmission from the front */
@@ -356,11 +395,11 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->N = N;
 					sc->data0 = param1;
 					sc->data1 = param2;
-					sc->offset = -stack_load_float(stack, data_node.z);
+					sc->data2 = -stack_load_float(stack, data_node.z);
 
 					if(!(sd->type & PRIMITIVE_ALL_CURVE)) {
 						sc->T = normalize(sd->dPdv);
-						sc->offset = 0.0f;
+						sc->data2 = 0.0f;
 					}
 					else
 						sc->T = sd->dPdu;
@@ -405,6 +444,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->sample_weight = sample_weight;
 					sc->data0 = radius.x;
 					sc->data1 = texture_blur;
+					sc->data2 = 0.0f;
 					sc->T.x = sharpness;
 #ifdef __OSL__
 					sc->prim = NULL;
@@ -421,6 +461,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->sample_weight = sample_weight;
 					sc->data0 = radius.y;
 					sc->data1 = texture_blur;
+					sc->data2 = 0.0f;
 					sc->T.x = sharpness;
 #ifdef __OSL__
 					sc->prim = NULL;
@@ -437,6 +478,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sc->sample_weight = sample_weight;
 					sc->data0 = radius.z;
 					sc->data1 = texture_blur;
+					sc->data2 = 0.0f;
 					sc->T.x = sharpness;
 #ifdef __OSL__
 					sc->prim = NULL;
@@ -582,16 +624,6 @@ ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint
 	svm_node_closure_store_weight(sd, weight);
 }
 
-ccl_device void svm_node_emission_set_weight_total(KernelGlobals *kg, ShaderData *sd, uint r, uint g, uint b)
-{
-	float3 weight = make_float3(__uint_as_float(r), __uint_as_float(g), __uint_as_float(b));
-
-	if(sd->object != OBJECT_NONE)
-		weight /= object_surface_area(kg, sd->object);
-
-	svm_node_closure_store_weight(sd, weight);
-}
-
 ccl_device void svm_node_closure_weight(ShaderData *sd, float *stack, uint weight_offset)
 {
 	float3 weight = stack_load_float3(stack, weight_offset);
@@ -603,14 +635,10 @@ ccl_device void svm_node_emission_weight(KernelGlobals *kg, ShaderData *sd, floa
 {
 	uint color_offset = node.y;
 	uint strength_offset = node.z;
-	uint total_power = node.w;
 
 	float strength = stack_load_float(stack, strength_offset);
 	float3 weight = stack_load_float3(stack, color_offset)*strength;
 
-	if(total_power && sd->object != OBJECT_NONE)
-		weight /= object_surface_area(kg, sd->object);
-
 	svm_node_closure_store_weight(sd, weight);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_convert.h b/intern/cycles/kernel/svm/svm_convert.h
index 2503912c5c6..b221e0728ec 100644
--- a/intern/cycles/kernel/svm/svm_convert.h
+++ b/intern/cycles/kernel/svm/svm_convert.h
@@ -45,13 +45,13 @@ ccl_device void svm_node_convert(ShaderData *sd, float *stack, uint type, uint f
 		}
 		case NODE_CONVERT_VF: {
 			float3 f = stack_load_float3(stack, from);
-			float g = (f.x + f.y + f.z)*(1.0f/3.0f);
+			float g = average(f);
 			stack_store_float(stack, to, g);
 			break;
 		}
 		case NODE_CONVERT_VI: {
 			float3 f = stack_load_float3(stack, from);
-			int i = (int)((f.x + f.y + f.z)*(1.0f/3.0f));
+			int i = (int)average(f);
 			stack_store_int(stack, to, i);
 			break;
 		}
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index daf7c6652d2..8a256c9bda5 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -134,8 +134,8 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 {
 #ifdef __KERNEL_CPU__
 #ifdef __KERNEL_SSE2__
-	__m128 r_m128;
-	float4 &r = (float4 &)r_m128;
+	ssef r_ssef;
+	float4 &r = (float4 &)r_ssef;
 	r = kernel_tex_image_interp(id, x, y);
 #else
 	float4 r = kernel_tex_image_interp(id, x, y);
@@ -252,9 +252,9 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 		case 96: r = kernel_tex_image_interp(__tex_image_096, x, y); break;
 		case 97: r = kernel_tex_image_interp(__tex_image_097, x, y); break;
 		case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break;
-		case 99: r = kernel_tex_image_interp(__tex_image_099, x, y); break;
 
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
+		case 99: r = kernel_tex_image_interp(__tex_image_099, x, y); break;
 		case 100: r = kernel_tex_image_interp(__tex_image_100, x, y); break;
 		case 101: r = kernel_tex_image_interp(__tex_image_101, x, y); break;
 		case 102: r = kernel_tex_image_interp(__tex_image_102, x, y); break;
@@ -318,14 +318,14 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 	float alpha = r.w;
 
 	if(use_alpha && alpha != 1.0f && alpha != 0.0f) {
-		r_m128 = _mm_div_ps(r_m128, _mm_set1_ps(alpha));
+		r_ssef = r_ssef / ssef(alpha);
 		if(id >= TEX_NUM_FLOAT_IMAGES)
-			r_m128 = _mm_min_ps(r_m128, _mm_set1_ps(1.0f));
+			r_ssef = min(r_ssef, ssef(1.0f));
 		r.w = alpha;
 	}
 
 	if(srgb) {
-		r_m128 = color_srgb_to_scene_linear(r_m128);
+		r_ssef = color_srgb_to_scene_linear(r_ssef);
 		r.w = alpha;
 	}
 #else
@@ -435,6 +435,10 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
 			weight.z = ((2.0f - limit)*N.z + (limit - 1.0f))/(2.0f*limit - 1.0f);
 		}
 	}
+	else {
+		/* Desperate mode, no valid choice anyway, fallback to one side.*/
+		weight.x = 1.0f;
+	}
 
 	/* now fetch textures */
 	uint co_offset, out_offset, alpha_offset, srgb;
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index 91dda8972f9..c77c2a1c482 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -38,11 +38,11 @@ ccl_device int quick_floor(float x)
 	return float_to_int(x) - ((x < 0) ? 1 : 0);
 }
 #else
-ccl_device_inline __m128i quick_floor_sse(const __m128& x)
+ccl_device_inline ssei quick_floor_sse(const ssef& x)
 {
-	__m128i b = _mm_cvttps_epi32(x);
-	__m128i isneg = _mm_castps_si128(_mm_cmplt_ps(x, _mm_set1_ps(0.0f)));
-	return _mm_add_epi32(b, isneg); // unsaturated add 0xffffffff is the same as subtract -1
+	ssei b = truncatei(x);
+	ssei isneg = cast((x < ssef(0.0f)).m128);
+	return b + isneg; // unsaturated add 0xffffffff is the same as subtract -1
 }
 #endif
 
@@ -52,9 +52,9 @@ ccl_device float bits_to_01(uint bits)
 	return bits * (1.0f/(float)0xFFFFFFFF);
 }
 #else
-ccl_device_inline __m128 bits_to_01_sse(const __m128i& bits)
+ccl_device_inline ssef bits_to_01_sse(const ssei& bits)
 {
-	return _mm_mul_ps(uint32_to_float(bits), _mm_set1_ps(1.0f/(float)0xFFFFFFFF));
+	return uint32_to_float(bits) * ssef(1.0f/(float)0xFFFFFFFF);
 }
 #endif
 
@@ -88,16 +88,16 @@ ccl_device uint hash(uint kx, uint ky, uint kz)
 }
 
 #ifdef __KERNEL_SSE2__
-ccl_device_inline __m128i hash_sse(const __m128i& kx, const __m128i& ky, const __m128i& kz)
+ccl_device_inline ssei hash_sse(const ssei& kx, const ssei& ky, const ssei& kz)
 {
-#define rot(x,k) _mm_or_si128(_mm_slli_epi32((x), (k)), _mm_srli_epi32((x), 32-(k)))
-#define xor_rot(a, b, c) do {a = _mm_xor_si128(a, b); a = _mm_sub_epi32(a, rot(b, c));} while(0)
+#define rot(x,k) (((x)<<(k)) | (srl(x, 32-(k))))
+#define xor_rot(a, b, c) do {a = a^b; a = a - rot(b, c);} while(0)
 
 	uint len = 3;
-	__m128i magic = _mm_set1_epi32(0xdeadbeef + (len << 2) + 13);
-	__m128i a = _mm_add_epi32(magic, kx);
-	__m128i b = _mm_add_epi32(magic, ky);
-	__m128i c = _mm_add_epi32(magic, kz);
+	ssei magic = ssei(0xdeadbeef + (len << 2) + 13);
+	ssei a = magic + kx;
+	ssei b = magic + ky;
+	ssei c = magic + kz;
 
 	xor_rot(c, b, 14);
 	xor_rot(a, c, 11);
@@ -133,10 +133,10 @@ ccl_device float floorfrac(float x, int* i)
 	return x - *i;
 }
 #else
-ccl_device_inline __m128 floorfrac_sse(const __m128& x, __m128i *i)
+ccl_device_inline ssef floorfrac_sse(const ssef& x, ssei *i)
 {
 	*i = quick_floor_sse(x);
-	return _mm_sub_ps(x, _mm_cvtepi32_ps(*i));
+	return x - ssef(*i);
 }
 #endif
 
@@ -146,11 +146,11 @@ ccl_device float fade(float t)
 	return t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f);
 }
 #else
-ccl_device_inline __m128 fade_sse(const __m128 *t)
+ccl_device_inline ssef fade_sse(const ssef *t)
 {
-	__m128 a = fma(*t, _mm_set1_ps(6.0f), _mm_set1_ps(-15.0f));
-	__m128 b = fma(*t, a, _mm_set1_ps(10.0f));
-	return _mm_mul_ps(_mm_mul_ps(*t, *t), _mm_mul_ps(*t, b));
+	ssef a = madd(*t, ssef(6.0f), ssef(-15.0f));
+	ssef b = madd(*t, a, ssef(10.0f));
+	return ((*t) * (*t)) * ((*t) * b);
 }
 #endif
 
@@ -160,10 +160,10 @@ ccl_device float nerp(float t, float a, float b)
 	return (1.0f - t) * a + t * b;
 }
 #else
-ccl_device_inline __m128 nerp_sse(const __m128& t, const __m128& a, const __m128& b)
+ccl_device_inline ssef nerp_sse(const ssef& t, const ssef& a, const ssef& b)
 {
-	__m128 x1 = _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(1.0f), t), a);
-	return fma(t, b, x1);
+	ssef x1 = (ssef(1.0f) - t) * a;
+	return madd(t, b, x1);
 }
 #endif
 
@@ -178,35 +178,35 @@ ccl_device float grad(int hash, float x, float y, float z)
 	return ((h&1) ? -u : u) + ((h&2) ? -v : v);
 }
 #else
-ccl_device_inline __m128 grad_sse(const __m128i& hash, const __m128& x, const __m128& y, const __m128& z)
+ccl_device_inline ssef grad_sse(const ssei& hash, const ssef& x, const ssef& y, const ssef& z)
 {
-	__m128i c1 = _mm_set1_epi32(1);
-	__m128i c2 = _mm_set1_epi32(2);
+	ssei c1 = ssei(1);
+	ssei c2 = ssei(2);
 
-	__m128i h = _mm_and_si128(hash, _mm_set1_epi32(15));          // h = hash & 15
+	ssei h = hash & ssei(15);                             // h = hash & 15
 
-	__m128i case_ux = _mm_cmplt_epi32(h, _mm_set1_epi32(8));       // 0xffffffff if h < 8 else 0
+	sseb case_ux = h < ssei(8);                           // 0xffffffff if h < 8 else 0
 
-	__m128 u = blend(_mm_castsi128_ps(case_ux), x, y);             // u = h<8 ? x : y
+	ssef u = select(case_ux, x, y);                       // u = h<8 ? x : y
 
-	__m128i case_vy = _mm_cmplt_epi32(h, _mm_set1_epi32(4));       // 0xffffffff if h < 4 else 0
+	sseb case_vy = h < ssei(4);                           // 0xffffffff if h < 4 else 0
 
-	__m128i case_h12 = _mm_cmpeq_epi32(h, _mm_set1_epi32(12));     // 0xffffffff if h == 12 else 0
-	__m128i case_h14 = _mm_cmpeq_epi32(h, _mm_set1_epi32(14));     // 0xffffffff if h == 14 else 0
+	sseb case_h12 = h == ssei(12);                        // 0xffffffff if h == 12 else 0
+	sseb case_h14 = h == ssei(14);                        // 0xffffffff if h == 14 else 0
 
-	__m128i case_vx = _mm_or_si128(case_h12, case_h14);            // 0xffffffff if h == 12 or h == 14 else 0
+	sseb case_vx = case_h12 | case_h14;                   // 0xffffffff if h == 12 or h == 14 else 0
 
-	__m128 v = blend(_mm_castsi128_ps(case_vy), y, blend(_mm_castsi128_ps(case_vx), x, z)); // v = h<4 ? y : h == 12 || h == 14 ? x : z
+	ssef v = select(case_vy, y, select(case_vx, x, z));   // v = h<4 ? y : h == 12 || h == 14 ? x : z
 
-	__m128i case_uneg = _mm_slli_epi32(_mm_and_si128(h, c1), 31);  // 1<<31 if h&1 else 0
-	__m128 case_uneg_mask = _mm_castsi128_ps(case_uneg);           // -0.0 if h&1 else +0.0
-	__m128 ru = _mm_xor_ps(u, case_uneg_mask);                     // -u if h&1 else u (copy float sign)
+	ssei case_uneg = (h & c1) << 31;                      // 1<<31 if h&1 else 0
+	ssef case_uneg_mask = cast(case_uneg);                // -0.0 if h&1 else +0.0
+	ssef ru = u ^ case_uneg_mask;                         // -u if h&1 else u (copy float sign)
 
-	__m128i case_vneg = _mm_slli_epi32(_mm_and_si128(h, c2), 30);  // 2<<30 if h&2 else 0
-	__m128 case_vneg_mask = _mm_castsi128_ps(case_vneg);           // -0.0 if h&2 else +0.0
-	__m128 rv = _mm_xor_ps(v, case_vneg_mask);                     // -v if h&2 else v (copy float sign)
+	ssei case_vneg = (h & c2) << 30;                      // 2<<30 if h&2 else 0
+	ssef case_vneg_mask = cast(case_vneg);                // -0.0 if h&2 else +0.0
+	ssef rv = v ^ case_vneg_mask;                         // -v if h&2 else v (copy float sign)
 
-	__m128 r = _mm_add_ps(ru, rv);                                 // ((h&1) ? -u : u) + ((h&2) ? -v : v)
+	ssef r = ru + rv;                                     // ((h&1) ? -u : u) + ((h&2) ? -v : v)
 	return r;
 }
 #endif
@@ -217,9 +217,9 @@ ccl_device float scale3(float result)
 	return 0.9820f * result;
 }
 #else
-ccl_device_inline __m128 scale3_sse(const __m128& result)
+ccl_device_inline ssef scale3_sse(const ssef& result)
 {
-	return _mm_mul_ps(_mm_set1_ps(0.9820f), result);
+	return ssef(0.9820f) * result;
 }
 #endif
 
@@ -252,75 +252,41 @@ ccl_device_noinline float perlin(float x, float y, float z)
 #else
 ccl_device_noinline float perlin(float x, float y, float z)
 {
-	__m128 xyz = _mm_setr_ps(x, y, z, 0.0f);
-	__m128i XYZ;
+	ssef xyz = ssef(x, y, z, 0.0f);
+	ssei XYZ;
 
-	__m128 fxyz = floorfrac_sse(xyz, &XYZ);
+	ssef fxyz = floorfrac_sse(xyz, &XYZ);
 
-	__m128 uvw = fade_sse(&fxyz);
-	__m128 u = broadcast<0>(uvw), v = broadcast<1>(uvw), w = broadcast<2>(uvw);
+	ssef uvw = fade_sse(&fxyz);
+	ssef u = shuffle<0>(uvw), v = shuffle<1>(uvw), w = shuffle<2>(uvw);
 
-	__m128i XYZ_ofc = _mm_add_epi32(XYZ, _mm_set1_epi32(1));
-	__m128i vdy = shuffle<1, 1, 1, 1>(XYZ, XYZ_ofc);                      // +0, +0, +1, +1
-	__m128i vdz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ_ofc)); // +0, +1, +0, +1
+	ssei XYZ_ofc = XYZ + ssei(1);
+	ssei vdy = shuffle<1, 1, 1, 1>(XYZ, XYZ_ofc);                      // +0, +0, +1, +1
+	ssei vdz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ_ofc)); // +0, +1, +0, +1
 
-	__m128i h1 = hash_sse(broadcast<0>(XYZ),     vdy, vdz);               // hash directions 000, 001, 010, 011
-	__m128i h2 = hash_sse(broadcast<0>(XYZ_ofc), vdy, vdz);               // hash directions 100, 101, 110, 111
+	ssei h1 = hash_sse(shuffle<0>(XYZ),     vdy, vdz);               // hash directions 000, 001, 010, 011
+	ssei h2 = hash_sse(shuffle<0>(XYZ_ofc), vdy, vdz);               // hash directions 100, 101, 110, 111
 
-	__m128 fxyz_ofc = _mm_sub_ps(fxyz, _mm_set1_ps(1.0f));
-	__m128 vfy = shuffle<1, 1, 1, 1>(fxyz, fxyz_ofc);
-	__m128 vfz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz_ofc));
+	ssef fxyz_ofc = fxyz - ssef(1.0f);
+	ssef vfy = shuffle<1, 1, 1, 1>(fxyz, fxyz_ofc);
+	ssef vfz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz_ofc));
 
-	__m128 g1 = grad_sse(h1, broadcast<0>(fxyz),     vfy, vfz);
-	__m128 g2 = grad_sse(h2, broadcast<0>(fxyz_ofc), vfy, vfz);
-	__m128 n1 = nerp_sse(u, g1, g2);
+	ssef g1 = grad_sse(h1, shuffle<0>(fxyz),     vfy, vfz);
+	ssef g2 = grad_sse(h2, shuffle<0>(fxyz_ofc), vfy, vfz);
+	ssef n1 = nerp_sse(u, g1, g2);
 
-	__m128 n1_half = shuffle<2, 3, 2, 3>(n1);      // extract 2 floats to a separate vector
-	__m128 n2 = nerp_sse(v, n1, n1_half);          // process nerp([a, b, _, _], [c, d, _, _]) -> [a', b', _, _]
+	ssef n1_half = shuffle<2, 3, 2, 3>(n1);      // extract 2 floats to a separate vector
+	ssef n2 = nerp_sse(v, n1, n1_half);          // process nerp([a, b, _, _], [c, d, _, _]) -> [a', b', _, _]
 
-	__m128 n2_second = broadcast<1>(n2);           // extract b to a separate vector
-	__m128 result = nerp_sse(w, n2, n2_second);    // process nerp([a', _, _, _], [b', _, _, _]) -> [a'', _, _, _]
+	ssef n2_second = shuffle<1>(n2);           // extract b to a separate vector
+	ssef result = nerp_sse(w, n2, n2_second);    // process nerp([a', _, _, _], [b', _, _, _]) -> [a'', _, _, _]
 
-	__m128 r = scale3_sse(result);
+	ssef r = scale3_sse(result);
 
-	__m128 infmask = _mm_castsi128_ps(_mm_set1_epi32(0x7f800000));
-	__m128 rinfmask = _mm_cmpeq_ps(_mm_and_ps(r, infmask), infmask); // 0xffffffff if r is inf/-inf/nan else 0
-	__m128 rfinite = _mm_andnot_ps(rinfmask, r);   // 0 if r is inf/-inf/nan else r
-	return _mm_cvtss_f32(rfinite);
-}
-#endif
-
-#if 0 // unused
-ccl_device_noinline float perlin_periodic(float x, float y, float z, float3 pperiod)
-{
-	int X; float fx = floorfrac(x, &X);
-	int Y; float fy = floorfrac(y, &Y);
-	int Z; float fz = floorfrac(z, &Z);
-
-	int3 p;
-
-	p.x = max(quick_floor(pperiod.x), 1);
-	p.y = max(quick_floor(pperiod.y), 1);
-	p.z = max(quick_floor(pperiod.z), 1);
-
-	float u = fade(fx);
-	float v = fade(fy);
-	float w = fade(fz);
-
-	float result;
-
-	result = nerp (w, nerp (v, nerp (u, grad (phash (X  , Y  , Z  , p), fx	 , fy	 , fz	  ),
-										grad (phash (X+1, Y  , Z  , p), fx-1.0f, fy	 , fz	  )),
-							   nerp (u, grad (phash (X  , Y+1, Z  , p), fx	 , fy-1.0f, fz	  ),
-										grad (phash (X+1, Y+1, Z  , p), fx-1.0f, fy-1.0f, fz	  ))),
-					  nerp (v, nerp (u, grad (phash (X  , Y  , Z+1, p), fx	 , fy	 , fz-1.0f ),
-										grad (phash (X+1, Y  , Z+1, p), fx-1.0f, fy	 , fz-1.0f )),
-							   nerp (u, grad (phash (X  , Y+1, Z+1, p), fx	 , fy-1.0f, fz-1.0f ),
-										grad (phash (X+1, Y+1, Z+1, p), fx-1.0f, fy-1.0f, fz-1.0f ))));
-	float r = scale3(result);
-
-	/* can happen for big coordinates, things even out to 0.0 then anyway */
-	return (isfinite(r))? r: 0.0f;
+	ssef infmask = cast(ssei(0x7f800000));
+	ssef rinfmask = ((r & infmask) == infmask).m128; // 0xffffffff if r is inf/-inf/nan else 0
+	ssef rfinite = andnot(rinfmask, r);              // 0 if r is inf/-inf/nan else r
+	return extract<0>(rfinite);
 }
 #endif
 
@@ -357,30 +323,15 @@ ccl_device float3 cellnoise_color(float3 p)
 	return make_float3(r, g, b);
 }
 #else
-ccl_device __m128 cellnoise_color(const __m128& p)
+ccl_device ssef cellnoise_color(const ssef& p)
 {
-	__m128i ip = quick_floor_sse(p);
-	__m128i ip_yxz = shuffle<1, 0, 2, 3>(ip);
-	__m128i ip_xyy = shuffle<0, 1, 1, 3>(ip);
-	__m128i ip_zzx = shuffle<2, 2, 0, 3>(ip);
+	ssei ip = quick_floor_sse(p);
+	ssei ip_yxz = shuffle<1, 0, 2, 3>(ip);
+	ssei ip_xyy = shuffle<0, 1, 1, 3>(ip);
+	ssei ip_zzx = shuffle<2, 2, 0, 3>(ip);
 	return bits_to_01_sse(hash_sse(ip_xyy, ip_yxz, ip_zzx));
 }
 #endif
 
-#if 0 // unused
-/* periodic perlin noise in range 0..1 */
-ccl_device float pnoise(float3 p, float3 pperiod)
-{
-	float r = perlin_periodic(p.x, p.y, p.z, pperiod);
-	return 0.5f*r + 0.5f;
-}
-
-/* periodic perlin noise in range -1..1 */
-ccl_device float psnoise(float3 p, float3 pperiod)
-{
-	return perlin_periodic(p.x, p.y, p.z, pperiod);
-}
-#endif
-
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/svm/svm_sepcomb_rgb.h b/intern/cycles/kernel/svm/svm_sepcomb_rgb.h
deleted file mode 100644
index 34c4449ecdb..00000000000
--- a/intern/cycles/kernel/svm/svm_sepcomb_rgb.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void svm_node_combine_rgb(ShaderData *sd, float *stack, uint in_offset, uint color_index, uint out_offset)
-{
-	float color = stack_load_float(stack, in_offset);
-
-	if (stack_valid(out_offset))
-		stack_store_float(stack, out_offset+color_index, color);
-}
-
-ccl_device void svm_node_separate_rgb(ShaderData *sd, float *stack, uint icolor_offset, uint color_index, uint out_offset)
-{
-	float3 color = stack_load_float3(stack, icolor_offset);
-
-	if (stack_valid(out_offset)) {
-		if (color_index == 0)
-			stack_store_float(stack, out_offset, color.x);
-		else if (color_index == 1)
-			stack_store_float(stack, out_offset, color.y);
-		else
-			stack_store_float(stack, out_offset, color.z);
-	}
-}
-
-CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/svm/svm_sepcomb_vector.h b/intern/cycles/kernel/svm/svm_sepcomb_vector.h
new file mode 100644
index 00000000000..c8e7e34f87d
--- /dev/null
+++ b/intern/cycles/kernel/svm/svm_sepcomb_vector.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Vector combine / separate, used for the RGB and XYZ nodes */
+
+ccl_device void svm_node_combine_vector(ShaderData *sd, float *stack, uint in_offset, uint vector_index, uint out_offset)
+{
+	float vector = stack_load_float(stack, in_offset);
+
+	if (stack_valid(out_offset))
+		stack_store_float(stack, out_offset+vector_index, vector);
+}
+
+ccl_device void svm_node_separate_vector(ShaderData *sd, float *stack, uint ivector_offset, uint vector_index, uint out_offset)
+{
+	float3 vector = stack_load_float3(stack, ivector_offset);
+
+	if (stack_valid(out_offset)) {
+		if (vector_index == 0)
+			stack_store_float(stack, out_offset, vector.x);
+		else if (vector_index == 1)
+			stack_store_float(stack, out_offset, vector.y);
+		else
+			stack_store_float(stack, out_offset, vector.z);
+	}
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/svm/svm_texture.h b/intern/cycles/kernel/svm/svm_texture.h
index 5fd9204cbf6..d97c85db36a 100644
--- a/intern/cycles/kernel/svm/svm_texture.h
+++ b/intern/cycles/kernel/svm/svm_texture.h
@@ -140,15 +140,15 @@ ccl_device float voronoi_F1_distance(float3 p)
 		}
 	}
 #else
-	__m128 vec_p = load_m128(p);
-	__m128i xyzi = quick_floor_sse(vec_p);
+	ssef vec_p = load4f(p);
+	ssei xyzi = quick_floor_sse(vec_p);
 
 	for (int xx = -1; xx <= 1; xx++) {
 		for (int yy = -1; yy <= 1; yy++) {
 			for (int zz = -1; zz <= 1; zz++) {
-				__m128 ip = _mm_cvtepi32_ps(_mm_add_epi32(xyzi, _mm_setr_epi32(xx, yy, zz, 0)));
-				__m128 vp = _mm_add_ps(ip, cellnoise_color(ip));
-				float d = len_squared<1, 1, 1, 0>(_mm_sub_ps(vec_p, vp));
+				ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0));
+				ssef vp = ip + cellnoise_color(ip);
+				float d = len_squared<1, 1, 1, 0>(vec_p - vp);
 				da = min(d, da);
 			}
 		}
@@ -184,15 +184,15 @@ ccl_device float3 voronoi_F1_color(float3 p)
 
 	return cellnoise_color(pa);
 #else
-	__m128 pa, vec_p = load_m128(p);
-	__m128i xyzi = quick_floor_sse(vec_p);
+	ssef pa, vec_p = load4f(p);
+	ssei xyzi = quick_floor_sse(vec_p);
 
 	for (int xx = -1; xx <= 1; xx++) {
 		for (int yy = -1; yy <= 1; yy++) {
 			for (int zz = -1; zz <= 1; zz++) {
-				__m128 ip = _mm_cvtepi32_ps(_mm_add_epi32(xyzi, _mm_setr_epi32(xx, yy, zz, 0)));
-				__m128 vp = _mm_add_ps(ip, cellnoise_color(ip));
-				float d = len_squared<1, 1, 1, 0>(_mm_sub_ps(vec_p, vp));
+				ssef ip = ssef(xyzi + ssei(xx, yy, zz, 0));
+				ssef vp = ip + cellnoise_color(ip);
+				float d = len_squared<1, 1, 1, 0>(vec_p - vp);
 
 				if(d < da) {
 					da = d;
@@ -202,7 +202,7 @@ ccl_device float3 voronoi_F1_color(float3 p)
 		}
 	}
 
-	__m128 color = cellnoise_color(pa);
+	ssef color = cellnoise_color(pa);
 	return (float3 &)color;
 #endif
 }
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index 80972ec82bc..fbe669c1fab 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -72,15 +72,14 @@ typedef enum NodeType {
 	NODE_TEX_COORD,
 	NODE_TEX_COORD_BUMP_DX,
 	NODE_TEX_COORD_BUMP_DY,
-	NODE_EMISSION_SET_WEIGHT_TOTAL,
 	NODE_ATTR_BUMP_DX,
 	NODE_ATTR_BUMP_DY,
 	NODE_TEX_ENVIRONMENT,
 	NODE_CLOSURE_HOLDOUT,
 	NODE_LAYER_WEIGHT,
 	NODE_CLOSURE_VOLUME,
-	NODE_SEPARATE_RGB,
-	NODE_COMBINE_RGB,
+	NODE_SEPARATE_VECTOR,
+	NODE_COMBINE_VECTOR,
 	NODE_SEPARATE_HSV,
 	NODE_COMBINE_HSV,
 	NODE_HSV,
@@ -349,7 +348,6 @@ typedef enum ClosureType {
 	/* Diffuse */
 	CLOSURE_BSDF_DIFFUSE_ID,
 	CLOSURE_BSDF_OREN_NAYAR_ID,
-	CLOSURE_BSDF_WESTIN_SHEEN_ID,
 	CLOSURE_BSDF_DIFFUSE_RAMP_ID,
 	CLOSURE_BSDF_DIFFUSE_TOON_ID,
 
@@ -358,9 +356,11 @@ typedef enum ClosureType {
 	CLOSURE_BSDF_REFLECTION_ID,
 	CLOSURE_BSDF_MICROFACET_GGX_ID,
 	CLOSURE_BSDF_MICROFACET_BECKMANN_ID,
-	CLOSURE_BSDF_WARD_ID,
+	CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID,
+	CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID,
+	CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID,
+	CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID,
 	CLOSURE_BSDF_ASHIKHMIN_VELVET_ID,
-	CLOSURE_BSDF_WESTIN_BACKSCATTER_ID,
 	CLOSURE_BSDF_PHONG_RAMP_ID,
 	CLOSURE_BSDF_GLOSSY_TOON_ID,
 	CLOSURE_BSDF_HAIR_REFLECTION_ID,
@@ -404,7 +404,7 @@ typedef enum ClosureType {
 #define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
 #define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
 #define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID)
-#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type == CLOSURE_BSDF_WARD_ID)
+#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID)
 #define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_GAUSSIAN_ID)
 #define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_GAUSSIAN_ID)
 #define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
author	Daniel Genrich <daniel.genrich@gmx.net>	2014-10-23 17:12:28 +0400
committer	Daniel Genrich <daniel.genrich@gmx.net>	2014-10-23 17:12:28 +0400
commit	9ff1ebed52e0f858a395eeea4caf89304e068b2d (patch)
tree	b05d0f4b229de61b088a128ad412dd7bba347928 /intern/cycles/kernel
parent	a2ed11c6eeab5fab8cb81e32e1c68fdafdd5dbbc (diff)
parent	eaaeae469968c5c78a5d7e6d202f1af00b382a79 (diff)