41 files changed, 563 insertions, 391 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 2ff6b53b0a5..8857f86890c 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -149,48 +149,61 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")
 
 	# warn for other versions
-	if(CUDA_VERSION MATCHES "60")
+	if(CUDA_VERSION MATCHES "65")
 	else()
 		message(WARNING
 			"CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, "
-			"build may succeed but only CUDA 6.0 is officially supported")
+			"build may succeed but only CUDA 6.5 is officially supported")
 	endif()
 
 	# build for each arch
 	set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
 	set(cuda_cubins)
 
-	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
-		set(cuda_cubin kernel_${arch}.cubin)
+	macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
+		if(${experimental})
+			set(cuda_extra_flags "-D__KERNEL_CUDA_EXPERIMENTAL__")
+			set(cuda_cubin kernel_experimental_${arch}.cubin)
+		else()
+			set(cuda_extra_flags "")
+			set(cuda_cubin kernel_${arch}.cubin)
+		endif()
 
 		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}")
 		set(cuda_math_flags "--use_fast_math")
 
-		if(CUDA_VERSION LESS 60 AND ${arch} MATCHES "sm_50")
-			message(WARNING "Can't build kernel for CUDA sm_50 architecture, skipping")
-		else()
-			add_custom_command(
-				OUTPUT ${cuda_cubin}
-				COMMAND ${CUDA_NVCC_EXECUTABLE}
-				        -arch=${arch}
-				        -m${CUDA_BITS}
-				        --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu
-				        -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
-				        --ptxas-options="-v"
-				        ${cuda_arch_flags}
-				        ${cuda_version_flags}
-				        ${cuda_math_flags}
-				        -I${CMAKE_CURRENT_SOURCE_DIR}/../util
-				        -I${CMAKE_CURRENT_SOURCE_DIR}/svm
-				        -DCCL_NAMESPACE_BEGIN=
-				        -DCCL_NAMESPACE_END=
-				        -DNVCC
-
-				DEPENDS ${cuda_sources})
-
-			delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
-			list(APPEND cuda_cubins ${cuda_cubin})
-		endif()
+		add_custom_command(
+			OUTPUT ${cuda_cubin}
+			COMMAND ${CUDA_NVCC_EXECUTABLE}
+					-arch=${arch}
+					-m${CUDA_BITS}
+					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu
+					-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
+					--ptxas-options="-v"
+					${cuda_arch_flags}
+					${cuda_version_flags}
+					${cuda_math_flags}
+					${cuda_extra_flags}
+					-I${CMAKE_CURRENT_SOURCE_DIR}/../util
+					-I${CMAKE_CURRENT_SOURCE_DIR}/svm
+					-DCCL_NAMESPACE_BEGIN=
+					-DCCL_NAMESPACE_END=
+					-DNVCC
+
+			DEPENDS ${cuda_sources})
+
+		delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
+		list(APPEND cuda_cubins ${cuda_cubin})
+
+		unset(cuda_extra_flags)
+	endmacro()
+
+	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
+		# Compile regular kernel
+		CYCLES_CUDA_KERNEL_ADD(${arch} FALSE)
+
+		# Compile experimental kernel
+		CYCLES_CUDA_KERNEL_ADD(${arch} TRUE)
 	endforeach()
 
 	add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins})
diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript
index cfe12e8533d..5a9e57c5342 100644
--- a/intern/cycles/kernel/SConscript
+++ b/intern/cycles/kernel/SConscript
@@ -69,8 +69,8 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     cuda_major_minor = re.findall(r'release (\d+).(\d+)', output)[0]
     cuda_version = int(cuda_major_minor[0])*10 + int(cuda_major_minor[1])
 
-    if cuda_version != 60:
-        print("CUDA version %d.%d detected, build may succeed but only CUDA 6.0 is officially supported." % (cuda_version/10, cuda_version%10))
+    if cuda_version != 65:
+        print("CUDA version %d.%d detected, build may succeed but only CUDA 6.5 is officially supported." % (cuda_version/10, cuda_version%10))
 
     # nvcc flags
     nvcc_flags = "-m%s" % (bits)
@@ -83,30 +83,35 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h')
     last_cubin_file = None
 
+    configs = (("kernel_%s.cubin", ''),
+               ("kernel_experimental_%s.cubin", ' -D__KERNEL_CUDA_EXPERIMENTAL__'))
+
     # add command for each cuda architecture
     for arch in cuda_archs:
-        if cuda_version < 60 and arch == "sm_50":
-            print("Can't build kernel for CUDA sm_50 architecture, skipping")
-            continue
-
-        cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch)
-
-        if env['BF_CYCLES_CUDA_ENV']:
-            MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd"
-            command = "\"%s\" & \"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, nvcc_flags, kernel_file, cubin_file)
-        else:
-            command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, nvcc_flags, kernel_file, cubin_file)
-
-        kernel.Command(cubin_file, 'kernel.cu', command)
-        kernel.Depends(cubin_file, dependencies)
-
-        kernel_binaries.append(cubin_file)
-        
-        if not env['WITH_BF_CYCLES_CUDA_THREADED_COMPILE']:
-            # trick to compile one kernel at a time to reduce memory usage
-            if last_cubin_file:
-                kernel.Depends(cubin_file, last_cubin_file)
-            last_cubin_file = cubin_file
+        for config in configs:
+            # TODO(sergey): Use dict instead ocouple in order to increase readability?
+            name = config[0]
+            extra_flags = config[1]
+
+            cubin_file = os.path.join(build_dir, name % arch)
+            current_flags = nvcc_flags + extra_flags
+
+            if env['BF_CYCLES_CUDA_ENV']:
+                MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd"
+                command = "\"%s\" & \"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, current_flags, kernel_file, cubin_file)
+            else:
+                command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, current_flags, kernel_file, cubin_file)
+
+            kernel.Command(cubin_file, 'kernel.cu', command)
+            kernel.Depends(cubin_file, dependencies)
+
+            kernel_binaries.append(cubin_file)
+
+            if not env['WITH_BF_CYCLES_CUDA_THREADED_COMPILE']:
+                # trick to compile one kernel at a time to reduce memory usage
+                if last_cubin_file:
+                    kernel.Depends(cubin_file, last_cubin_file)
+                last_cubin_file = cubin_file
 
 Return('kernel_binaries')
 
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 81c239ea0c9..9961071c2ac 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -23,9 +23,7 @@
 #include "../closure/bsdf_reflection.h"
 #include "../closure/bsdf_refraction.h"
 #include "../closure/bsdf_transparent.h"
-#ifdef __ANISOTROPIC__
 #include "../closure/bsdf_ashikhmin_shirley.h"
-#endif
 #include "../closure/bsdf_westin.h"
 #include "../closure/bsdf_toon.h"
 #include "../closure/bsdf_hair.h"
@@ -94,13 +92,11 @@ ccl_device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const Shader
 			label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
-#ifdef __ANISOTROPIC__
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
 			label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
-#endif
 		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
 			label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
@@ -190,12 +186,10 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
 				eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
-#ifdef __ANISOTROPIC__
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
 				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
-#endif
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
 				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
@@ -260,12 +254,10 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
 				eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
-#ifdef __ANISOTROPIC__
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
 				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
-#endif
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
 				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
@@ -348,12 +340,10 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
 			bsdf_microfacet_beckmann_blur(sc, roughness);
 			break;
-#ifdef __ANISOTROPIC__
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
 			bsdf_ashikhmin_shirley_blur(sc, roughness);
 			break;
-#endif
 		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
 			bsdf_ashikhmin_velvet_blur(sc, roughness);
 			break;
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
index 6a5d0410e01..ad7864cb8ea 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -77,7 +77,7 @@ ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, c
 		NdotI = fmaxf(NdotI, 1e-6f);
 		NdotO = fmaxf(NdotO, 1e-6f);
 		float3 H = normalize(omega_in + I);
-		float HdotI = fmaxf(dot(H, I), 1e-6f);
+		float HdotI = fmaxf(fabsf(dot(H, I)), 1e-6f);
 		float HdotN = fmaxf(dot(H, N), 1e-6f);
 
 		float pump = 1.0f / fmaxf(1e-6f, (HdotI*fmaxf(NdotO, NdotI))); /* pump from original paper (first derivative disc., but cancels the HdotI in the pdf nicely) */
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index df0644becee..a0c59e6cbc0 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -362,7 +362,7 @@ ccl_device_inline float3 microfacet_sample_stretched(
  * E. Heitz, Research Report 2014
  *
  * Anisotropy is only supported for reflection currently, but adding it for
- * tranmission is just a matter of copying code from reflection if needed. */
+ * transmission is just a matter of copying code from reflection if needed. */
 
 ccl_device int bsdf_microfacet_ggx_setup(ShaderClosure *sc)
 {
@@ -513,6 +513,10 @@ ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc, con
 	float cosHO = dot(Ht, I);
 	float cosHI = dot(Ht, omega_in);
 
+	/* those situations makes chi+ terms in eq. 33, 34 be zero */
+	if(dot(Ht, N) <= 0.0f || cosHO * cosNO <= 0.0f || cosHI * cosNI <= 0.0f)
+		return make_float3(0.0f, 0.0f, 0.0f);
+
 	float D, G1o, G1i;
 
 	/* eq. 33: first we calculate D(m) with m=Ht: */
@@ -862,7 +866,11 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc
 	float cosHO = dot(Ht, I);
 	float cosHI = dot(Ht, omega_in);
 
-	/* eq. 33: first we calculate D(m) with m=Ht: */
+	/* those situations makes chi+ terms in eq. 25, 27 be zero */
+	if(dot(Ht, N) <= 0.0f || cosHO * cosNO <= 0.0f || cosHI * cosNI <= 0.0f)
+		return make_float3(0.0f, 0.0f, 0.0f);
+
+	/* eq. 25: first we calculate D(m) with m=Ht: */
 	float alpha2 = alpha_x * alpha_y;
 	float cosThetaM = min(dot(N, Ht), 1.0f);
 	float cosThetaM2 = cosThetaM * cosThetaM;
diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h
index b3dcb9dcc38..05816bac2c1 100644
--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -111,16 +111,20 @@ ccl_device float fresnel_dielectric_cos(float cosi, float eta)
 	return 1.0f; // TIR(no refracted component)
 }
 
-ccl_device float fresnel_conductor(float cosi, float eta, float k)
+#if 0
+ccl_device float3 fresnel_conductor(float cosi, const float3 eta, const float3 k)
 {
-	float tmp_f = eta * eta + k * k;
-	float tmp = tmp_f * cosi * cosi;
-	float Rparl2 = (tmp - (2.0f * eta * cosi) + 1)/
-	               (tmp + (2.0f * eta * cosi) + 1);
-	float Rperp2 = (tmp_f - (2.0f * eta * cosi) + cosi * cosi)/
-	               (tmp_f + (2.0f * eta * cosi) + cosi * cosi);
+	float3 cosi2 = make_float3(cosi*cosi);
+	float3 one = make_float3(1.0f, 1.0f, 1.0f);
+	float3 tmp_f = eta * eta + k * k;
+	float3 tmp = tmp_f * cosi2;
+	float3 Rparl2 = (tmp - (2.0f * eta * cosi) + one) /
+					(tmp + (2.0f * eta * cosi) + one);
+	float3 Rperp2 = (tmp_f - (2.0f * eta * cosi) + cosi2) /
+					(tmp_f + (2.0f * eta * cosi) + cosi2);
 	return(Rparl2 + Rperp2) * 0.5f;
 }
+#endif
 
 ccl_device float smooth_step(float edge0, float edge1, float x)
 {
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index 863836ffcea..c4e9e2ababe 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -936,9 +936,10 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 		sd->u = isect->u;
 		sd->v = 0.0f;
 #endif
-	
+
+		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
+
 		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
-			tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
 			sd->Ng = normalize(-(D - tg * (dot(tg, D))));
 		}
 		else {
@@ -950,7 +951,6 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 			float gd = isect->v;
 
 			if(gd != 0.0f) {
-				tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
 				sd->Ng = sd->Ng - gd * tg;
 				sd->Ng = normalize(sd->Ng);
 			}
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h
index 5ab0b731bdd..7409aa0d014 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -272,7 +272,11 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD
 #endif
 
 	/* compute face normal */
-	float3 Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
+	float3 Ng;
+	if(sd->flag & SD_NEGATIVE_SCALE_APPLIED)
+		Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0]));
+	else
+		Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
 
 	sd->Ng = Ng;
 	sd->N = Ng;
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index f2f35c2efd0..41e9d183a96 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -18,7 +18,7 @@
 /* Triangle Primitive
  *
  * Basic triangle with 3 vertices is used to represent mesh surfaces. For BVH
- * ray intersection we use a precomputed triangle storage to accelarate
+ * ray intersection we use a precomputed triangle storage to accelerate
  * intersection at the cost of more memory usage */
 
 CCL_NAMESPACE_BEGIN
@@ -117,21 +117,24 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, ShaderDat
 }
 
 /* normal on triangle  */
-ccl_device_inline float3 triangle_normal(KernelGlobals *kg, int prim)
+ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
 {
 	/* load triangle vertices */
-	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
+	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
 
 	float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
 	float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
 	float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
 	
 	/* return normal */
-	return normalize(cross(v1 - v0, v2 - v0));
+	if(sd->flag & SD_NEGATIVE_SCALE_APPLIED)
+		return normalize(cross(v2 - v0, v1 - v0));
+	else
+		return normalize(cross(v1 - v0, v2 - v0));
 }
 
 /* point and normal on triangle  */
-ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int prim, float u, float v, float3 *P, float3 *Ng, int *shader)
+ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int object, int prim, float u, float v, float3 *P, float3 *Ng, int *shader)
 {
 	/* load triangle vertices */
 	float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, prim));
@@ -144,8 +147,14 @@ ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int prim, float
 	float t = 1.0f - u - v;
 	*P = (u*v0 + v*v1 + t*v2);
 
+	/* get object flags, instance-aware */
+	int object_flag = kernel_tex_fetch(__object_flag, object >= 0 ? object : ~object);
+
 	/* compute normal */
-	*Ng = normalize(cross(v1 - v0, v2 - v0));
+	if(object_flag & SD_NEGATIVE_SCALE_APPLIED)
+		*Ng = normalize(cross(v2 - v0, v1 - v0));
+	else
+		*Ng = normalize(cross(v1 - v0, v2 - v0));
 
 	/* shader`*/
 	*shader = __float_as_int(kernel_tex_fetch(__tri_shader, prim));
diff --git a/intern/cycles/kernel/kernel.cl b/intern/cycles/kernel/kernel.cl
index 2e0a49435a8..4f20ef9ca15 100644
--- a/intern/cycles/kernel/kernel.cl
+++ b/intern/cycles/kernel/kernel.cl
@@ -115,7 +115,7 @@ __kernel void kernel_ocl_shader(
 	ccl_global type *name,
 #include "kernel_textures.h"
 
-	int type, int sx, int sw, int sample)
+	int type, int sx, int sw, int offset, int sample)
 {
 	KernelGlobals kglobals, *kg = &kglobals;
 
@@ -140,7 +140,7 @@ __kernel void kernel_ocl_bake(
 	ccl_global type *name,
 #include "kernel_textures.h"
 
-	int type, int sx, int sw, int sample)
+	int type, int sx, int sw, int offset, int sample)
 {
 	KernelGlobals kglobals, *kg = &kglobals;
 
@@ -153,6 +153,6 @@ __kernel void kernel_ocl_bake(
 	int x = sx + get_global_id(0);
 
 	if(x < sx + sw)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample);
 }
 
diff --git a/intern/cycles/kernel/kernel.cpp b/intern/cycles/kernel/kernel.cpp
index 42eb9a62518..fa2113fbb46 100644
--- a/intern/cycles/kernel/kernel.cpp
+++ b/intern/cycles/kernel/kernel.cpp
@@ -120,10 +120,10 @@ void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *bu
 
 /* Shader Evaluation */
 
-void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
+void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
 {
 	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
 	else
 		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }
diff --git a/intern/cycles/kernel/kernel.cu b/intern/cycles/kernel/kernel.cu
index 9208acc232e..d5b5293664c 100644
--- a/intern/cycles/kernel/kernel.cu
+++ b/intern/cycles/kernel/kernel.cu
@@ -146,7 +146,7 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw, int sample)
+kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -156,12 +156,12 @@ kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw, int s
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_bake(uint4 *input, float4 *output, int type, int sx, int sw, int sample)
+kernel_cuda_bake(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
 	if(x < sx + sw)
-		kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, x, sample);
+		kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, x, offset, sample);
 }
 
 #endif
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 264e5e3e4d0..19e06b88797 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -41,7 +41,7 @@ void kernel_cpu_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i, int sample);
+	int type, int i, int offset, int sample);
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
@@ -51,7 +51,7 @@ void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buf
 void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i, int sample);
+	int type, int i, int offset, int sample);
 #endif
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
@@ -62,7 +62,7 @@ void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buf
 void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i, int sample);
+	int type, int i, int offset, int sample);
 #endif
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
@@ -73,7 +73,7 @@ void kernel_cpu_sse41_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *bu
 void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i, int sample);
+	int type, int i, int offset, int sample);
 #endif
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
@@ -84,7 +84,7 @@ void kernel_cpu_avx_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buff
 void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i, int sample);
+	int type, int i, int offset, int sample);
 #endif
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
@@ -95,7 +95,7 @@ void kernel_cpu_avx2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buf
 void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i, int sample);
+	int type, int i, int offset, int sample);
 #endif
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index b4f6dcdace9..b0efcdc66a7 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -32,10 +32,11 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v
 		eval->transmission = make_float3(0.0f, 0.0f, 0.0f);
 		eval->transparent = make_float3(0.0f, 0.0f, 0.0f);
 		eval->subsurface = make_float3(0.0f, 0.0f, 0.0f);
+		eval->scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 		if(type == CLOSURE_BSDF_TRANSPARENT_ID)
 			eval->transparent = value;
-		else if(CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_PHASE(type))
+		else if(CLOSURE_IS_BSDF_DIFFUSE(type))
 			eval->diffuse = value;
 		else if(CLOSURE_IS_BSDF_GLOSSY(type))
 			eval->glossy = value;
@@ -43,6 +44,8 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v
 			eval->transmission = value;
 		else if(CLOSURE_IS_BSDF_BSSRDF(type))
 			eval->subsurface = value;
+		else if(CLOSURE_IS_PHASE(type))
+			eval->scatter = value;
 	}
 	else
 		eval->diffuse = value;
@@ -51,11 +54,17 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v
 #endif
 }
 
-ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
+/* TODO(sergey): This is just a workaround for annoying 6.5 compiler bug. */
+#if !defined(__KERNEL_CUDA__) || __CUDA_ARCH__ < 500
+ccl_device_inline
+#else
+ccl_device_noinline
+#endif
+void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
 {
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
-		if(CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_PHASE(type))
+		if(CLOSURE_IS_BSDF_DIFFUSE(type))
 			eval->diffuse += value;
 		else if(CLOSURE_IS_BSDF_GLOSSY(type))
 			eval->glossy += value;
@@ -63,6 +72,8 @@ ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3
 			eval->transmission += value;
 		else if(CLOSURE_IS_BSDF_BSSRDF(type))
 			eval->subsurface += value;
+		else if(CLOSURE_IS_PHASE(type))
+			eval->scatter += value;
 
 		/* skipping transparent, this function is used by for eval(), will be zero then */
 	}
@@ -81,7 +92,8 @@ ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
 			&& is_zero(eval->glossy)
 			&& is_zero(eval->transmission)
 			&& is_zero(eval->transparent)
-			&& is_zero(eval->subsurface);
+			&& is_zero(eval->subsurface)
+			&& is_zero(eval->scatter);
 	}
 	else
 		return is_zero(eval->diffuse);
@@ -98,6 +110,7 @@ ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float3 value)
 		eval->glossy *= value;
 		eval->transmission *= value;
 		eval->subsurface *= value;
+		eval->scatter *= value;
 
 		/* skipping transparent, this function is used by for eval(), will be zero then */
 	}
@@ -111,7 +124,7 @@ ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float3 value)
 /* Path Radiance
  *
  * We accumulate different render passes separately. After summing at the end
- * to get the combined result, it should be identical. We definte directly
+ * to get the combined result, it should be identical. We definite directly
  * visible as the first non-transparent hit, while indirectly visible are the
  * bounces after that. */
 
@@ -130,21 +143,25 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
 		L->color_glossy = make_float3(0.0f, 0.0f, 0.0f);
 		L->color_transmission = make_float3(0.0f, 0.0f, 0.0f);
 		L->color_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+		L->color_scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 		L->direct_diffuse = make_float3(0.0f, 0.0f, 0.0f);
 		L->direct_glossy = make_float3(0.0f, 0.0f, 0.0f);
 		L->direct_transmission = make_float3(0.0f, 0.0f, 0.0f);
 		L->direct_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+		L->direct_scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 		L->indirect_diffuse = make_float3(0.0f, 0.0f, 0.0f);
 		L->indirect_glossy = make_float3(0.0f, 0.0f, 0.0f);
 		L->indirect_transmission = make_float3(0.0f, 0.0f, 0.0f);
 		L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+		L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 		L->path_diffuse = make_float3(0.0f, 0.0f, 0.0f);
 		L->path_glossy = make_float3(0.0f, 0.0f, 0.0f);
 		L->path_transmission = make_float3(0.0f, 0.0f, 0.0f);
 		L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+		L->path_scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 		L->emission = make_float3(0.0f, 0.0f, 0.0f);
 		L->background = make_float3(0.0f, 0.0f, 0.0f);
@@ -174,14 +191,16 @@ ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, float3 *throug
 			L->path_glossy = bsdf_eval->glossy*value;
 			L->path_transmission = bsdf_eval->transmission*value;
 			L->path_subsurface = bsdf_eval->subsurface*value;
+			L->path_scatter = bsdf_eval->scatter*value;
 
-			*throughput = L->path_diffuse + L->path_glossy + L->path_transmission + L->path_subsurface;
+			*throughput = L->path_diffuse + L->path_glossy + L->path_transmission + L->path_subsurface + L->path_scatter;
 			
 			L->direct_throughput = *throughput;
 		}
 		else {
 			/* transparent bounce before first hit, or indirectly visible through BSDF */
-			float3 sum = (bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->transparent + bsdf_eval->subsurface)*inverse_pdf;
+			float3 sum = (bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->transparent +
+						  bsdf_eval->subsurface + bsdf_eval->scatter) * inverse_pdf;
 			*throughput *= sum;
 		}
 	}
@@ -241,6 +260,7 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
 			L->direct_glossy += throughput*bsdf_eval->glossy*shadow;
 			L->direct_transmission += throughput*bsdf_eval->transmission*shadow;
 			L->direct_subsurface += throughput*bsdf_eval->subsurface*shadow;
+			L->direct_scatter += throughput*bsdf_eval->scatter*shadow;
 
 			if(is_lamp) {
 				L->shadow.x += shadow.x*shadow_fac;
@@ -250,7 +270,7 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
 		}
 		else {
 			/* indirectly visible lighting after BSDF bounce */
-			float3 sum = bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->subsurface;
+			float3 sum = bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->subsurface + bsdf_eval->scatter;
 			L->indirect += throughput*sum*shadow;
 		}
 	}
@@ -291,12 +311,14 @@ ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
 		L->direct_glossy += L->path_glossy*L->direct_emission;
 		L->direct_transmission += L->path_transmission*L->direct_emission;
 		L->direct_subsurface += L->path_subsurface*L->direct_emission;
+		L->direct_scatter += L->path_scatter*L->direct_emission;
 
 		L->indirect = safe_divide_color(L->indirect, L->direct_throughput);
 		L->indirect_diffuse += L->path_diffuse*L->indirect;
 		L->indirect_glossy += L->path_glossy*L->indirect;
 		L->indirect_transmission += L->path_transmission*L->indirect;
 		L->indirect_subsurface += L->path_subsurface*L->indirect;
+		L->indirect_scatter += L->path_scatter*L->indirect;
 	}
 #endif
 }
@@ -309,6 +331,7 @@ ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L)
 		L->path_glossy = make_float3(0.0f, 0.0f, 0.0f);
 		L->path_transmission = make_float3(0.0f, 0.0f, 0.0f);
 		L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+		L->path_scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 		L->direct_emission = make_float3(0.0f, 0.0f, 0.0f);
 		L->indirect = make_float3(0.0f, 0.0f, 0.0f);
@@ -327,8 +350,8 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 	if(L->use_light_pass) {
 		path_radiance_sum_indirect(L);
 
-		L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission + L->direct_subsurface + L->emission;
-		L_indirect = L->indirect_diffuse + L->indirect_glossy + L->indirect_transmission + L->indirect_subsurface;
+		L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission + L->direct_subsurface + L->direct_scatter + L->emission;
+		L_indirect = L->indirect_diffuse + L->indirect_glossy + L->indirect_transmission + L->indirect_subsurface + L->indirect_scatter;
 
 		if(!kernel_data.background.transparent)
 			L_direct += L->background;
@@ -344,11 +367,13 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 			L->direct_glossy = make_float3(0.0f, 0.0f, 0.0f);
 			L->direct_transmission = make_float3(0.0f, 0.0f, 0.0f);
 			L->direct_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+			L->direct_scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 			L->indirect_diffuse = make_float3(0.0f, 0.0f, 0.0f);
 			L->indirect_glossy = make_float3(0.0f, 0.0f, 0.0f);
 			L->indirect_transmission = make_float3(0.0f, 0.0f, 0.0f);
 			L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f);
+			L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 			L->emission = make_float3(0.0f, 0.0f, 0.0f);
 		}
@@ -368,6 +393,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 				L->direct_glossy *= scale;
 				L->direct_transmission *= scale;
 				L->direct_subsurface *= scale;
+				L->direct_scatter *= scale;
 				L->emission *= scale;
 				L->background *= scale;
 			}
@@ -382,6 +408,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 				L->indirect_glossy *= scale;
 				L->indirect_transmission *= scale;
 				L->indirect_subsurface *= scale;
+				L->indirect_scatter *= scale;
 			}
 
 			/* Sum again, after clamping */
@@ -416,11 +443,13 @@ ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance
 	L->direct_glossy += L_sample->direct_glossy*fac;
 	L->direct_transmission += L_sample->direct_transmission*fac;
 	L->direct_subsurface += L_sample->direct_subsurface*fac;
+	L->direct_scatter += L_sample->direct_scatter*fac;
 
 	L->indirect_diffuse += L_sample->indirect_diffuse*fac;
 	L->indirect_glossy += L_sample->indirect_glossy*fac;
 	L->indirect_transmission += L_sample->indirect_transmission*fac;
 	L->indirect_subsurface += L_sample->indirect_subsurface*fac;
+	L->indirect_scatter += L_sample->indirect_scatter*fac;
 
 	L->emission += L_sample->emission*fac;
 	L->background += L_sample->background*fac;
diff --git a/intern/cycles/kernel/kernel_avx.cpp b/intern/cycles/kernel/kernel_avx.cpp
index d612a82b785..e7ff21a6f09 100644
--- a/intern/cycles/kernel/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernel_avx.cpp
@@ -68,10 +68,10 @@ void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float
 
 /* Shader Evaluate */
 
-void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
+void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
 {
 	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
 	else
 		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }
diff --git a/intern/cycles/kernel/kernel_avx2.cpp b/intern/cycles/kernel/kernel_avx2.cpp
index 339421a002b..cb1662bbfbe 100644
--- a/intern/cycles/kernel/kernel_avx2.cpp
+++ b/intern/cycles/kernel/kernel_avx2.cpp
@@ -69,10 +69,10 @@ void kernel_cpu_avx2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
 
 /* Shader Evaluate */
 
-void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
+void kernel_cpu_avx2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
 {
 	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
 	else
 		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index c2d14b7f835..dfbb49db7e6 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -172,7 +172,8 @@ ccl_device_inline float bake_clamp_mirror_repeat(float u)
 }
 #endif
 
-ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i, int sample)
+ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output,
+                                     ShaderEvalType type, int i, int offset, int sample)
 {
 	ShaderData sd;
 	uint4 in = input[i * 2];
@@ -197,7 +198,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	int num_samples = kernel_data.integrator.aa_samples;
 
 	/* random number generator */
-	RNG rng = cmj_hash(i, 0);
+	RNG rng = cmj_hash(offset + i, 0);
 
 #if 0
 	uint rng_state = cmj_hash(i, 0);
@@ -215,7 +216,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	int shader;
 	float3 P, Ng;
 
-	triangle_point_normal(kg, prim, u, v, &P, &Ng, &shader);
+	triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
 
 	/* dummy initilizations copied from SHADER_EVAL_DISPLACE */
 	float3 I = Ng;
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index 6b03abe9708..5c83358a56d 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -21,16 +21,22 @@ CCL_NAMESPACE_BEGIN
 ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v)
 {
 	float blades = kernel_data.cam.blades;
+	float2 bokeh;
 
 	if(blades == 0.0f) {
 		/* sample disk */
-		return concentric_sample_disk(u, v);
+		bokeh = concentric_sample_disk(u, v);
 	}
 	else {
 		/* sample polygon */
 		float rotation = kernel_data.cam.bladesrotation;
-		return regular_polygon_sample(blades, rotation, u, v);
+		bokeh = regular_polygon_sample(blades, rotation, u, v);
 	}
+
+	/* anamorphic lens bokeh */
+	bokeh.x *= kernel_data.cam.inv_aperture_ratio;
+
+	return bokeh;
 }
 
 ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
@@ -183,7 +189,8 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float
 
 		/* calculate orthonormal coordinates perpendicular to D */
 		float3 U, V;
-		make_orthonormals(D, &U, &V);
+		U = normalize(make_float3(1.0f, 0.0f, 0.0f) -  D.x * D);
+		V = normalize(cross(D, U));
 
 		/* update ray for effect of lens */
 		ray->P = U * lensuv.x + V * lensuv.y;
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index bda98b84da8..4b2bb723ab6 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -108,6 +108,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 			eval->glossy = make_float3(0.0f, 0.0f, 0.0f);
 		if(ls->shader & SHADER_EXCLUDE_TRANSMIT)
 			eval->transmission = make_float3(0.0f, 0.0f, 0.0f);
+		if(ls->shader & SHADER_EXCLUDE_SCATTER)
+			eval->scatter = make_float3(0.0f, 0.0f, 0.0f);
 	}
 #endif
 
@@ -187,7 +189,8 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st
 		if(ls.shader & SHADER_EXCLUDE_ANY) {
 			if(((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
 			   ((ls.shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & PATH_RAY_GLOSSY)) ||
-			   ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)))
+			   ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
+			   ((ls.shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
 				continue;
 		}
 #endif
@@ -231,7 +234,8 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *sta
 		if(((shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
 		   ((shader & SHADER_EXCLUDE_GLOSSY) && (state->flag & PATH_RAY_GLOSSY)) ||
 		   ((shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
-		   ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA)))
+		   ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA)) ||
+		   ((shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
 			return make_float3(0.0f, 0.0f, 0.0f);
 	}
 
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index 7a850844bf2..2a5b7689e57 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -14,6 +14,8 @@
  * limitations under the License
  */
 
+/* TODO(sergey): Consider moving portable ctz/clz stuff to util. */
+
 CCL_NAMESPACE_BEGIN
 
 /* "Correlated Multi-Jittered Sampling"
@@ -35,8 +37,16 @@ ccl_device_inline int cmj_fast_mod_pow2(int a, int b)
 /* a must be > 0 and b must be > 1 */
 ccl_device_inline int cmj_fast_div_pow2(int a, int b)
 {
-#if defined(__KERNEL_SSE2__) && !defined(_MSC_VER)
+	kernel_assert(a > 0);
+	kernel_assert(b > 1);
+#if defined(__KERNEL_SSE2__)
+#  ifdef _MSC_VER
+	unsigned long ctz;
+	_BitScanForward(&ctz, b);
+	return a >> ctz;
+#  else
 	return a >> __builtin_ctz(b);
+#  endif
 #else
 	return a/b;
 #endif
@@ -44,8 +54,15 @@ ccl_device_inline int cmj_fast_div_pow2(int a, int b)
 
 ccl_device_inline uint cmj_w_mask(uint w)
 {
-#if defined(__KERNEL_SSE2__) && !defined(_MSC_VER)
+	kernel_assert(w > 1);
+#if defined(__KERNEL_SSE2__)
+#  ifdef _MSC_VER
+	unsigned long leading_zero;
+	_BitScanReverse(&leading_zero, w);
+	return ((1 << (1 + leading_zero)) - 1);
+#  else
 	return ((1 << (32 - __builtin_clz(w))) - 1);
+#  endif
 #else
 	w |= w >> 1;
 	w |= w >> 2;
@@ -165,7 +182,8 @@ ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
 		smodm = cmj_fast_mod_pow2(s, m);
 	}
 	else {
-		sdivm = float_to_int(s * invm);
+		/* Doing s*inmv gives precision issues here. */
+		sdivm = s / m;
 		smodm = s - sdivm*m;
 	}
 
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index 0adf9ed4666..e7f62f230f8 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -27,7 +27,7 @@ typedef struct LightSample {
 	float pdf;			/* light sampling probability density function */
 	float eval_fac;		/* intensity multiplier */
 	int object;			/* object id for triangle/curve lights */
-	int prim;			/* primitive id for triangle/curve ligths */
+	int prim;			/* primitive id for triangle/curve lights */
 	int shader;			/* shader id */
 	int lamp;			/* lamp id */
 	LightType type;		/* type of light */
@@ -208,8 +208,8 @@ ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3
 	return t*t/cos_pi;
 }
 
-ccl_device bool lamp_light_sample(KernelGlobals *kg, int lamp,
-	float randu, float randv, float3 P, LightSample *ls, bool for_volume)
+ccl_device void lamp_light_sample(KernelGlobals *kg, int lamp,
+	float randu, float randv, float3 P, LightSample *ls)
 {
 	float4 data0 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 0);
 	float4 data1 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 1);
@@ -224,11 +224,6 @@ ccl_device bool lamp_light_sample(KernelGlobals *kg, int lamp,
 	ls->v = randv;
 
 	if(type == LIGHT_DISTANT) {
-#ifdef __VOLUME__
-		if(for_volume)
-			return false;
-#endif
-
 		/* distant light */
 		float3 lightD = make_float3(data0.y, data0.z, data0.w);
 		float3 D = lightD;
@@ -249,11 +244,6 @@ ccl_device bool lamp_light_sample(KernelGlobals *kg, int lamp,
 	}
 #ifdef __BACKGROUND_MIS__
 	else if(type == LIGHT_BACKGROUND) {
-#ifdef __VOLUME__
-		if(for_volume)
-			return false;
-#endif
-
 		/* infinite area light (e.g. light dome or env light) */
 		float3 D = background_light_sample(kg, randu, randv, &ls->pdf);
 
@@ -309,8 +299,6 @@ ccl_device bool lamp_light_sample(KernelGlobals *kg, int lamp,
 		ls->eval_fac *= kernel_data.integrator.inv_pdf_lights;
 		ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 	}
-
-	return true;
 }
 
 ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, float t, LightSample *ls)
@@ -469,7 +457,7 @@ ccl_device void triangle_light_sample(KernelGlobals *kg, int prim, int object,
 	v = randv*randu;
 
 	/* triangle, so get position, normal, shader */
-	triangle_point_normal(kg, prim, u, v, &ls->P, &ls->Ng, &ls->shader);
+	triangle_point_normal(kg, object, prim, u, v, &ls->P, &ls->Ng, &ls->shader);
 	ls->object = object;
 	ls->prim = prim;
 	ls->lamp = LAMP_NONE;
@@ -526,7 +514,7 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float randt)
 
 /* Generic Light */
 
-ccl_device bool light_sample(KernelGlobals *kg, float randt, float randu, float randv, float time, float3 P, LightSample *ls, bool for_volume)
+ccl_device void light_sample(KernelGlobals *kg, float randt, float randu, float randv, float time, float3 P, LightSample *ls)
 {
 	/* sample index */
 	int index = light_distribution_sample(kg, randt);
@@ -545,12 +533,10 @@ ccl_device bool light_sample(KernelGlobals *kg, float randt, float randu, float
 		ls->D = normalize_len(ls->P - P, &ls->t);
 		ls->pdf = triangle_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 		ls->shader |= shader_flag;
-
-		return true;
 	}
 	else {
 		int lamp = -prim-1;
-		return lamp_light_sample(kg, lamp, randu, randv, P, ls, for_volume);
+		lamp_light_sample(kg, lamp, randu, randv, P, ls);
 	}
 }
 
@@ -560,11 +546,6 @@ ccl_device int light_select_num_samples(KernelGlobals *kg, int index)
 	return __float_as_int(data3.x);
 }
 
-ccl_device bool light_select(KernelGlobals *kg, int index, float randu, float randv, float3 P, LightSample *ls, bool for_volume)
-{
-	return lamp_light_sample(kg, index, randu, randv, P, ls, for_volume);
-}
-
 ccl_device int lamp_light_eval_sample(KernelGlobals *kg, float randt)
 {
 	/* sample index */
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 0c033f6234c..515854b6e9c 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -89,6 +89,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 			volume_ray.t = (hit)? isect.t: FLT_MAX;
 
 			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
+
+#ifdef __VOLUME_DECOUPLED__
 			int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
 			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false, sampling_method);
 
@@ -135,28 +137,32 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, Ray ray,
 				kernel_volume_decoupled_free(kg, &volume_segment);
 
 				if(result == VOLUME_PATH_SCATTERED) {
-					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray, 1.0f))
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray))
 						continue;
 					else
 						break;
 				}
 			}
-			else {
+			else
+#endif
+			{
 				/* integrate along volume segment with distance sampling */
 				ShaderData volume_sd;
 				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &state, &volume_sd, &volume_ray, L, &throughput, rng);
+					kg, &state, &volume_sd, &volume_ray, L, &throughput, rng, heterogeneous);
 
+#ifdef __VOLUME_SCATTER__
 				if(result == VOLUME_PATH_SCATTERED) {
 					/* direct lighting */
-					kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, L, 1.0f);
+					kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, L);
 
 					/* indirect light bounce */
-					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray, 1.0f))
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, L, &ray))
 						continue;
 					else
 						break;
 				}
+#endif
 			}
 		}
 #endif
@@ -470,6 +476,8 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 			volume_ray.t = (hit)? isect.t: FLT_MAX;
 
 			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
+
+#ifdef __VOLUME_DECOUPLED__
 			int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
 			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method);
 
@@ -516,28 +524,32 @@ ccl_device float4 kernel_path_integrate(KernelGlobals *kg, RNG *rng, int sample,
 				kernel_volume_decoupled_free(kg, &volume_segment);
 
 				if(result == VOLUME_PATH_SCATTERED) {
-					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray, 1.0f))
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
 						continue;
 					else
 						break;
 				}
 			}
-			else {
+			else 
+#endif
+			{
 				/* integrate along volume segment with distance sampling */
 				ShaderData volume_sd;
 				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng);
+					kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng, heterogeneous);
 
+#ifdef __VOLUME_SCATTER__
 				if(result == VOLUME_PATH_SCATTERED) {
 					/* direct lighting */
-					kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L, 1.0f);
+					kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L);
 
 					/* indirect light bounce */
-					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray, 1.0f))
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
 						continue;
 					else
 						break;
 				}
+#endif
 			}
 		}
 #endif
@@ -803,10 +815,11 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		if(state.volume_stack[0].shader != SHADER_NONE) {
 			Ray volume_ray = ray;
 			volume_ray.t = (hit)? isect.t: FLT_MAX;
+			
+			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
 
-#ifdef __KERNEL_CPU__
+#ifdef __VOLUME_DECOUPLED__
 			/* decoupled ray marching only supported on CPU */
-			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
 
 			/* cache steps along volume for repeated sampling */
 			VolumeSegment volume_segment;
@@ -850,16 +863,17 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 
 					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
 						&ps, &pray, &volume_sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
+						
+					(void)result;
+					kernel_assert(result == VOLUME_PATH_SCATTERED);
 
-					if(result == VOLUME_PATH_SCATTERED) {
-						if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) {
-							kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
+						kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
 
-							/* for render passes, sum and reset indirect light pass variables
-							 * for the next samples */
-							path_radiance_sum_indirect(&L);
-							path_radiance_reset_indirect(&L);
-						}
+						/* for render passes, sum and reset indirect light pass variables
+						 * for the next samples */
+						path_radiance_sum_indirect(&L);
+						path_radiance_reset_indirect(&L);
 					}
 				}
 			}
@@ -883,21 +897,22 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 				PathState ps = state;
 				Ray pray = ray;
 				ShaderData volume_sd;
-				float3 tp = throughput;
+				float3 tp = throughput * num_samples_inv;
 
 				/* branch RNG state */
 				path_state_branch(&ps, j, num_samples);
 
 				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &ps, &volume_sd, &volume_ray, &L, &tp, rng);
+					kg, &ps, &volume_sd, &volume_ray, &L, &tp, rng, heterogeneous);
 				
+#ifdef __VOLUME_SCATTER__
 				if(result == VOLUME_PATH_SCATTERED) {
 					/* todo: support equiangular, MIS and all light sampling.
 					 * alternatively get decoupled ray marching working on the GPU */
-					kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L, num_samples_inv);
+					kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L);
 
-					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray, num_samples_inv)) {
-						kernel_path_indirect(kg, rng, pray, tp*num_samples_inv, num_samples, ps, &L);
+					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &tp, &ps, &L, &pray)) {
+						kernel_path_indirect(kg, rng, pray, tp, num_samples, ps, &L);
 
 						/* for render passes, sum and reset indirect light pass variables
 						 * for the next samples */
@@ -905,6 +920,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 						path_radiance_reset_indirect(&L);
 					}
 				}
+#endif
 			}
 
 			/* todo: avoid this calculation using decoupled ray marching */
@@ -941,7 +957,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 
 		/* holdout */
 #ifdef __HOLDOUT__
-		if((sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK))) {
+		if(sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK)) {
 			if(kernel_data.background.transparent) {
 				float3 holdout_weight;
 				
@@ -1031,6 +1047,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		ray.P = ray_offset(sd.P, -sd.Ng);
 		ray.t -= sd.ray_length; /* clipping works through transparent */
 
+
+#ifdef __RAY_DIFFERENTIALS__
+		ray.dP = sd.dP;
+		ray.dD.dx = -sd.dI.dx;
+		ray.dD.dy = -sd.dI.dy;
+#endif
+
 #ifdef __VOLUME__
 		/* enter/exit volume */
 		kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index 7700abfbdae..9553c2da0df 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -50,7 +50,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
 
 				LightSample ls;
-				light_select(kg, i, light_u, light_v, sd->P, &ls, false);
+				lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls);
 
 				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 					/* trace shadow ray */
@@ -82,7 +82,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 					light_t = 0.5f*light_t;
 
 				LightSample ls;
-				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls, false);
+				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
 
 				if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 					/* trace shadow ray */
@@ -103,7 +103,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
 		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
 
 		LightSample ls;
-		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls, false);
+		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
 
 		/* sample random light */
 		if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
@@ -200,7 +200,7 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
 #endif
 
 	LightSample ls;
-	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls, false);
+	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
 
 	if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
 		/* trace shadow ray */
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index 8453b79de45..da2d5e6eca8 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -16,11 +16,10 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifdef __VOLUME__
+#ifdef __VOLUME_SCATTER__
 
 ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L,
-	float num_samples_adjust)
+	ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L)
 {
 #ifdef __EMISSION__
 	if(!kernel_data.integrator.use_direct_light)
@@ -41,9 +40,8 @@ ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
 	light_ray.time = sd->time;
 #endif
 
-	if(!light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls, true))
-		return;
-	else if(ls.pdf == 0.0f)
+	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
+	if(ls.pdf == 0.0f)
 		return;
 	
 	if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
@@ -52,15 +50,14 @@ ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
 
 		if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
 			/* accumulate */
-			path_radiance_accum_light(L, throughput * num_samples_adjust, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+			path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
 		}
 	}
 #endif
 }
 
 ccl_device bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray,
-	float num_samples_adjust)
+	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
 {
 	/* sample phase function */
 	float phase_pdf;
@@ -135,26 +132,22 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
 
 				LightSample ls;
-				if(!light_select(kg, i, light_u, light_v, ray->P, &ls, true))
-					continue;
+				lamp_light_sample(kg, i, light_u, light_v, ray->P, &ls);		
 
 				float3 tp = throughput;
 
 				/* sample position on volume segment */
-				if(segment) {
-					float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
-					float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
+				float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
+				float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
 
-					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
-						state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
+				VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+					state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
+					
+				(void)result;
+				kernel_assert(result == VOLUME_PATH_SCATTERED);
 
-					if(result != VOLUME_PATH_SCATTERED)
-						continue;
-
-					/* todo: split up light_sample so we don't have to call it again with new position */
-					if(!light_select(kg, i, light_u, light_v, sd->P, &ls, true))
-						continue;
-				}
+				/* todo: split up light_sample so we don't have to call it again with new position */
+				lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls);
 
 				if(ls.pdf == 0.0f)
 					continue;
@@ -190,26 +183,22 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 					light_t = 0.5f*light_t;
 
 				LightSample ls;
-				if(!light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls, true))
-					continue;
+				light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls);
 
 				float3 tp = throughput;
 
 				/* sample position on volume segment */
-				if(segment) {
-					float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
-					float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
+				float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
+				float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
 
-					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
-						state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
+				VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+					state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
+					
+				(void)result;
+				kernel_assert(result == VOLUME_PATH_SCATTERED);
 
-					if(result != VOLUME_PATH_SCATTERED)
-						continue;
-
-					/* todo: split up light_sample so we don't have to call it again with new position */
-					if(!light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls, true))
-						continue;
-				}
+				/* todo: split up light_sample so we don't have to call it again with new position */
+				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
 
 				if(ls.pdf == 0.0f)
 					continue;
@@ -233,26 +222,22 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
 
 		LightSample ls;
-		if(!light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls, true))
-			return;
+		light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, &ls);
 
 		float3 tp = throughput;
 
 		/* sample position on volume segment */
-		if(segment) {
-			float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
-			float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+		float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
+		float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
 
-			VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
-				state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
+		VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+			state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
+			
+		(void)result;
+		kernel_assert(result == VOLUME_PATH_SCATTERED);
 
-			if(result != VOLUME_PATH_SCATTERED)
-				return;
-
-			/* todo: split up light_sample so we don't have to call it again with new position */
-			if(!light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls, true))
-				return;
-		}
+		/* todo: split up light_sample so we don't have to call it again with new position */
+		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, &ls);
 
 		if(ls.pdf == 0.0f)
 			return;
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 842b9f68840..fc61f1a9c2c 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -86,7 +86,7 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
 #endif
 	if(sd->type & PRIMITIVE_TRIANGLE) {
 		/* static triangle */
-		float3 Ng = triangle_normal(kg, sd->prim);
+		float3 Ng = triangle_normal(kg, sd);
 		sd->shader =  __float_as_int(kernel_tex_fetch(__tri_shader, sd->prim));
 
 		/* vectors */
@@ -165,7 +165,7 @@ ccl_device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderDat
 
 	/* fetch triangle data */
 	if(sd->type == PRIMITIVE_TRIANGLE) {
-		float3 Ng = triangle_normal(kg, sd->prim);
+		float3 Ng = triangle_normal(kg, sd);
 		sd->shader =  __float_as_int(kernel_tex_fetch(__tri_shader, sd->prim));
 
 		/* static triangle */
@@ -340,7 +340,7 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
 	float3 P, Ng, I = make_float3(0.0f, 0.0f, 0.0f);
 	int shader;
 
-	triangle_point_normal(kg, prim, u, v, &P, &Ng, &shader);
+	triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
 
 	/* force smooth shading for displacement */
 	shader |= SHADER_SMOOTH_NORMAL;
diff --git a/intern/cycles/kernel/kernel_sse2.cpp b/intern/cycles/kernel/kernel_sse2.cpp
index 67bd0943b1b..740998e8c92 100644
--- a/intern/cycles/kernel/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernel_sse2.cpp
@@ -64,10 +64,10 @@ void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
 
 /* Shader Evaluate */
 
-void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
+void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
 {
 	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
 	else
 		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }
diff --git a/intern/cycles/kernel/kernel_sse3.cpp b/intern/cycles/kernel/kernel_sse3.cpp
index 40d621b66f6..da73a3a1c97 100644
--- a/intern/cycles/kernel/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernel_sse3.cpp
@@ -66,10 +66,10 @@ void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, floa
 
 /* Shader Evaluate */
 
-void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
+void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
 {
 	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
 	else
 		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }
diff --git a/intern/cycles/kernel/kernel_sse41.cpp b/intern/cycles/kernel/kernel_sse41.cpp
index 4b48d10b020..5704f60e138 100644
--- a/intern/cycles/kernel/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernel_sse41.cpp
@@ -67,10 +67,10 @@ void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, flo
 
 /* Shader Evaluate */
 
-void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
+void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
 {
 	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
 	else
 		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index d81909a623a..933202ea498 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -66,6 +66,8 @@ CCL_NAMESPACE_BEGIN
 #define __SUBSURFACE__
 #define __CMJ__
 #define __VOLUME__
+#define __VOLUME_DECOUPLED__
+#define __VOLUME_SCATTER__
 #define __SHADOW_RECORD_ALL__
 #endif
 
@@ -73,10 +75,15 @@ CCL_NAMESPACE_BEGIN
 #define __KERNEL_SHADING__
 #define __KERNEL_ADV_SHADING__
 #define __BRANCHED_PATH__
+#define __VOLUME__
+#define __VOLUME_SCATTER__
 
 /* Experimental on GPU */
-//#define __VOLUME__
-//#define __SUBSURFACE__
+#ifdef __KERNEL_CUDA_EXPERIMENTAL__
+#define __SUBSURFACE__
+#define __CMJ__
+#endif
+
 #endif
 
 #ifdef __KERNEL_OPENCL__
@@ -103,7 +110,6 @@ CCL_NAMESPACE_BEGIN
 #define __BACKGROUND_MIS__
 #define __LAMP_MIS__
 #define __AO__
-#define __ANISOTROPIC__
 //#define __CAMERA_MOTION__
 //#define __OBJECT_MOTION__
 //#define __HAIR__
@@ -134,11 +140,9 @@ CCL_NAMESPACE_BEGIN
 #ifdef __KERNEL_SHADING__
 #define __SVM__
 #define __EMISSION__
-#define __PROCEDURAL_TEXTURES__
-#define __IMAGE_TEXTURES__
+#define __TEXTURES__
 #define __EXTRA_NODES__
 #define __HOLDOUT__
-#define __NORMAL_MAP__
 #endif
 
 #ifdef __KERNEL_ADV_SHADING__
@@ -148,7 +152,6 @@ CCL_NAMESPACE_BEGIN
 #define __BACKGROUND_MIS__
 #define __LAMP_MIS__
 #define __AO__
-#define __ANISOTROPIC__
 #define __CAMERA_MOTION__
 #define __OBJECT_MOTION__
 #define __HAIR__
@@ -223,10 +226,9 @@ enum PathTraceDimension {
 	PRNG_PHASE_V = 9,
 	PRNG_PHASE = 10,
 	PRNG_SCATTER_DISTANCE = 11,
-	PRNG_BOUNCE_NUM = 12,
-#else
-	PRNG_BOUNCE_NUM = 8,
 #endif
+
+	PRNG_BOUNCE_NUM = 12,
 };
 
 enum SamplingPattern {
@@ -252,17 +254,17 @@ enum PathRayFlag {
 	PATH_RAY_SHADOW_TRANSPARENT = 256,
 	PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
 
-	PATH_RAY_CURVE = 512, /* visibility flag to define curve segments*/
+	PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */
+	PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */
 
 	/* note that these can use maximum 12 bits, the other are for layers */
-	PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512),
+	PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024),
 
-	PATH_RAY_MIS_SKIP = 1024,
-	PATH_RAY_DIFFUSE_ANCESTOR = 2048,
-	PATH_RAY_GLOSSY_ANCESTOR = 4096,
-	PATH_RAY_BSSRDF_ANCESTOR = 8192,
-	PATH_RAY_SINGLE_PASS_DONE = 16384,
-	PATH_RAY_VOLUME_SCATTER = 32768,
+	PATH_RAY_MIS_SKIP = 2048,
+	PATH_RAY_DIFFUSE_ANCESTOR = 4096,
+	PATH_RAY_GLOSSY_ANCESTOR = 8192,
+	PATH_RAY_BSSRDF_ANCESTOR = 16384,
+	PATH_RAY_SINGLE_PASS_DONE = 32768,
 
 	/* we need layer member flags to be the 20 upper bits */
 	PATH_RAY_LAYER_SHIFT = (32-20)
@@ -332,21 +334,25 @@ typedef struct PathRadiance {
 	float3 color_glossy;
 	float3 color_transmission;
 	float3 color_subsurface;
+	float3 color_scatter;
 
 	float3 direct_diffuse;
 	float3 direct_glossy;
 	float3 direct_transmission;
 	float3 direct_subsurface;
+	float3 direct_scatter;
 
 	float3 indirect_diffuse;
 	float3 indirect_glossy;
 	float3 indirect_transmission;
 	float3 indirect_subsurface;
+	float3 indirect_scatter;
 
 	float3 path_diffuse;
 	float3 path_glossy;
 	float3 path_transmission;
 	float3 path_subsurface;
+	float3 path_scatter;
 
 	float4 shadow;
 	float mist;
@@ -360,6 +366,7 @@ typedef struct BsdfEval {
 	float3 transmission;
 	float3 transparent;
 	float3 subsurface;
+	float3 scatter;
 } BsdfEval;
 
 #else
@@ -380,7 +387,8 @@ typedef enum ShaderFlag {
 	SHADER_EXCLUDE_GLOSSY = (1 << 26),
 	SHADER_EXCLUDE_TRANSMIT = (1 << 25),
 	SHADER_EXCLUDE_CAMERA = (1 << 24),
-	SHADER_EXCLUDE_ANY = (SHADER_EXCLUDE_DIFFUSE|SHADER_EXCLUDE_GLOSSY|SHADER_EXCLUDE_TRANSMIT|SHADER_EXCLUDE_CAMERA),
+	SHADER_EXCLUDE_SCATTER = (1 << 23),
+	SHADER_EXCLUDE_ANY = (SHADER_EXCLUDE_DIFFUSE|SHADER_EXCLUDE_GLOSSY|SHADER_EXCLUDE_TRANSMIT|SHADER_EXCLUDE_CAMERA|SHADER_EXCLUDE_SCATTER),
 
 	SHADER_MASK = ~(SHADER_SMOOTH_NORMAL|SHADER_CAST_SHADOW|SHADER_AREA_LIGHT|SHADER_USE_MIS|SHADER_EXCLUDE_ANY)
 } ShaderFlag;
@@ -392,10 +400,8 @@ typedef enum LightType {
 	LIGHT_DISTANT,
 	LIGHT_BACKGROUND,
 	LIGHT_AREA,
-	LIGHT_AO,
 	LIGHT_SPOT,
-	LIGHT_TRIANGLE,
-	LIGHT_STRAND
+	LIGHT_TRIANGLE
 } LightType;
 
 /* Camera Type */
@@ -526,17 +532,15 @@ typedef struct ShaderClosure {
 	ClosureType type;
 	float3 weight;
 
-	float sample_weight;
-
 	float data0;
 	float data1;
+	float data2;
 
 	float3 N;
-#if defined(__ANISOTROPIC__) || defined(__SUBSURFACE__) || defined(__HAIR__)
 	float3 T;
-#endif
+	
+	float sample_weight;
 
-	float data2;
 #ifdef __OSL__
 	void *prim;
 #endif
@@ -596,7 +600,8 @@ enum ShaderDataFlag {
 	/* object flags */
 	SD_HOLDOUT_MASK = 524288,			/* holdout for camera rays */
 	SD_OBJECT_MOTION = 1048576,			/* has object motion blur */
-	SD_TRANSFORM_APPLIED = 2097152, 	/* vertices have transform applied */
+	SD_TRANSFORM_APPLIED = 2097152,		/* vertices have transform applied */
+	SD_NEGATIVE_SCALE_APPLIED = 4194304,	/* vertices have negative scale applied */
 
 	SD_OBJECT_FLAGS = (SD_HOLDOUT_MASK|SD_OBJECT_MOTION|SD_TRANSFORM_APPLIED)
 };
@@ -762,9 +767,12 @@ typedef struct KernelCamera {
 	/* render size */
 	float width, height;
 	int resolution;
+
+	/* anamorphic lens bokeh */
+	float inv_aperture_ratio;
+
 	int pad1;
 	int pad2;
-	int pad3;
 
 	/* more matrices */
 	Transform screentoworld;
@@ -866,7 +874,8 @@ typedef struct KernelIntegrator {
 	int transparent_shadows;
 
 	/* caustics */
-	int no_caustics;
+	int caustics_reflective;
+	int caustics_refractive;
 	float filter_glossy;
 
 	/* seed */
@@ -927,7 +936,6 @@ typedef enum CurveFlag {
 } CurveFlag;
 
 typedef struct KernelCurves {
-	/* strand intersect and normal parameters - many can be changed to flags */
 	int curveflags;
 	int subdivisions;
 
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index cc4b2e3edf1..ea02ede10cd 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -176,6 +176,8 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 	/* compute extinction at the start */
 	float t = 0.0f;
 
+	float3 sum = make_float3(0.0f, 0.0f, 0.0f);
+
 	for(int i = 0; i < max_steps; i++) {
 		/* advance to new position */
 		float new_t = min(ray->t, (i+1) * step);
@@ -190,20 +192,26 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 
 		/* compute attenuation over segment */
 		if(volume_shader_extinction_sample(kg, sd, state, new_P, &sigma_t)) {
-			/* todo: we could avoid computing expf() for each step by summing,
-			 * because exp(a)*exp(b) = exp(a+b), but we still want a quick
-			 * tp_eps check too */
-			tp *= volume_color_transmittance(sigma_t, new_t - t);
-
-			/* stop if nearly all light blocked */
-			if(tp.x < tp_eps && tp.y < tp_eps && tp.z < tp_eps)
-				break;
+			/* Compute expf() only for every Nth step, to save some calculations
+			 * because exp(a)*exp(b) = exp(a+b), also do a quick tp_eps check then. */
+
+			sum += (-sigma_t * (new_t - t));
+			if((i & 0x07) == 0) { /* ToDo: Other interval? */
+				tp = *throughput * make_float3(expf(sum.x), expf(sum.y), expf(sum.z));
+
+				/* stop if nearly all light is blocked */
+				if(tp.x < tp_eps && tp.y < tp_eps && tp.z < tp_eps)
+					break;
+			}
 		}
 
 		/* stop if at the end of the volume */
 		t = new_t;
-		if(t == ray->t)
+		if(t == ray->t) {
+			/* Update throughput in case we haven't done it above */
+			tp = *throughput * make_float3(expf(sum.x), expf(sum.y), expf(sum.z));
 			break;
+		}
 	}
 
 	*throughput = tp;
@@ -326,6 +334,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 	float t = ray->t;
 	float3 new_tp;
 
+#ifdef __VOLUME_SCATTER__
 	/* randomly scatter, and if we do t is shortened */
 	if(closure_flag & SD_SCATTER) {
 		/* extinction coefficient */
@@ -379,7 +388,9 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 			new_tp = *throughput * transmittance / pdf;
 		}
 	}
-	else if(closure_flag & SD_ABSORPTION) {
+	else 
+#endif
+	if(closure_flag & SD_ABSORPTION) {
 		/* absorption only, no sampling needed */
 		float3 transmittance = volume_color_transmittance(coeff.sigma_a, t);
 		new_tp = *throughput * transmittance;
@@ -456,6 +467,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
 			bool scatter = false;
 
 			/* distance sampling */
+#ifdef __VOLUME_SCATTER__
 			if((closure_flag & SD_SCATTER) || (has_scatter && (closure_flag & SD_ABSORPTION))) {
 				has_scatter = true;
 
@@ -491,7 +503,9 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
 					xi = 1.0f - (1.0f - xi)/sample_transmittance;
 				}
 			}
-			else if(closure_flag & SD_ABSORPTION) {
+			else 
+#endif
+			if(closure_flag & SD_ABSORPTION) {
 				/* absorption only, no sampling needed */
 				float3 sigma_a = coeff.sigma_a;
 
@@ -546,13 +560,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
  * between the endpoints. distance sampling is used to decide if we will
  * scatter or not. */
 ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg,
-	PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng)
+	PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng, bool heterogeneous)
 {
 	/* workaround to fix correlation bug in T38710, can find better solution
 	 * in random number generator later, for now this is done here to not impact
 	 * performance of rendering without volumes */
 	RNG tmp_rng = cmj_hash(*rng, state->rng_offset);
-	bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
 
 	shader_setup_from_volume(kg, sd, ray, state->bounce, state->transparent_bounce);
 
@@ -724,16 +737,15 @@ ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *s
  * scattering, they always scatter if there is any non-zero scattering
  * coefficient.
  *
- * these also do not do emission or modify throughput. */
+ * these also do not do emission or modify throughput. 
+ * 
+ * function is expected to return VOLUME_PATH_SCATTERED when probalistic_scatter is false */
 ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 	KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd,
 	float3 *throughput, float rphase, float rscatter,
 	const VolumeSegment *segment, const float3 *light_P, bool probalistic_scatter)
 {
-	int closure_flag = segment->closure_flag;
-
-	if(!(closure_flag & SD_SCATTER))
-		return VOLUME_PATH_MISSED;
+	kernel_assert(segment->closure_flag & SD_SCATTER);
 
 	/* pick random color channel, we use the Veach one-sample
 	 * model with balance heuristic for the channels */
@@ -845,15 +857,33 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 		float3 step_pdf_distance = make_float3(1.0f, 1.0f, 1.0f);
 
 		if(segment->numsteps > 1) {
-			/* todo: optimize using binary search */
 			float3 prev_cdf_distance = make_float3(0.0f, 0.0f, 0.0f);
 
-			for(int i = 0; i < segment->numsteps-1; i++, step++) {
-				if(sample_t < step->t)
+			int numsteps = segment->numsteps;
+			int high = numsteps - 1;
+			int low = 0;
+			int mid;
+
+			while(low < high) {
+				mid = (low + high) >> 1;
+
+				if(sample_t < step[mid].t)
+					high = mid;
+				else if(sample_t >= step[mid + 1].t)
+					low = mid + 1;
+				else {
+					/* found our interval in step[mid] .. step[mid+1] */
+					prev_t = step[mid].t;
+					prev_cdf_distance = step[mid].cdf_distance;
+					step += mid+1;
 					break;
+				}
+			}
 
-				prev_t = step->t;
-				prev_cdf_distance = step->cdf_distance;
+			if(low >= numsteps - 1) {
+				prev_t = step[numsteps - 1].t;
+				prev_cdf_distance = step[numsteps-1].cdf_distance;
+				step += numsteps - 1;
 			}
 
 			/* pdf for picking step with distance sampling */
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index 94337290d20..84ef85e089d 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -66,18 +66,6 @@ ClosureParam *closure_bssrdf_cubic_params()
 	static ClosureParam params[] = {
 		CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, sc.N),
 		CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, radius),
-		//CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.data1),
-	    CLOSURE_STRING_KEYPARAM("label"),
-	    CLOSURE_FINISH_PARAM(CubicBSSRDFClosure)
-	};
-	return params;
-}
-
-ClosureParam *closure_bssrdf_cubic_extended_params()
-{
-	static ClosureParam params[] = {
-		CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, sc.N),
-		CLOSURE_FLOAT3_PARAM(CubicBSSRDFClosure, radius),
 		CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.data1),
 		CLOSURE_FLOAT_PARAM(CubicBSSRDFClosure, sc.T.x),
 	    CLOSURE_STRING_KEYPARAM("label"),
@@ -107,18 +95,6 @@ ClosureParam *closure_bssrdf_gaussian_params()
 	static ClosureParam params[] = {
 		CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, sc.N),
 		CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, radius),
-		//CLOSURE_FLOAT_PARAM(GaussianBSSRDFClosure, sc.data1),
-	    CLOSURE_STRING_KEYPARAM("label"),
-	    CLOSURE_FINISH_PARAM(GaussianBSSRDFClosure)
-	};
-	return params;
-}
-
-ClosureParam *closure_bssrdf_gaussian_extended_params()
-{
-	static ClosureParam params[] = {
-		CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, sc.N),
-		CLOSURE_FLOAT3_PARAM(GaussianBSSRDFClosure, radius),
 		CLOSURE_FLOAT_PARAM(GaussianBSSRDFClosure, sc.data1),
 	    CLOSURE_STRING_KEYPARAM("label"),
 	    CLOSURE_FINISH_PARAM(GaussianBSSRDFClosure)
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index 2d716103b85..d7789edcfff 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -105,7 +105,7 @@ BSDF_CLOSURE_CLASS_BEGIN(AshikhminVelvet, ashikhmin_velvet, ashikhmin_velvet, LA
 	CLOSURE_FLOAT_PARAM(AshikhminVelvetClosure, sc.data0),
 BSDF_CLOSURE_CLASS_END(AshikhminVelvet, ashikhmin_velvet)
 
-BSDF_CLOSURE_CLASS_BEGIN(AshikhminShirley, ashikhmin_shirley_aniso, ashikhmin_shirley, LABEL_GLOSSY)
+BSDF_CLOSURE_CLASS_BEGIN(AshikhminShirley, ashikhmin_shirley_aniso, ashikhmin_shirley, LABEL_GLOSSY|LABEL_REFLECT)
 	CLOSURE_FLOAT3_PARAM(AshikhminShirleyClosure, sc.N),
 	CLOSURE_FLOAT3_PARAM(AshikhminShirleyClosure, sc.T),
 	CLOSURE_FLOAT_PARAM(AshikhminShirleyClosure, sc.data0),
@@ -124,37 +124,37 @@ BSDF_CLOSURE_CLASS_BEGIN(GlossyToon, glossy_toon, glossy_toon, LABEL_GLOSSY)
 	CLOSURE_FLOAT_PARAM(GlossyToonClosure, sc.data1),
 BSDF_CLOSURE_CLASS_END(GlossyToon, glossy_toon)
 
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGX, microfacet_ggx, microfacet_ggx, LABEL_GLOSSY)
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGX, microfacet_ggx, microfacet_ggx, LABEL_GLOSSY|LABEL_REFLECT)
 	CLOSURE_FLOAT3_PARAM(MicrofacetGGXClosure, sc.N),
 	CLOSURE_FLOAT_PARAM(MicrofacetGGXClosure, sc.data0),
 BSDF_CLOSURE_CLASS_END(MicrofacetGGX, microfacet_ggx)
 
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXAniso, microfacet_ggx_aniso, microfacet_ggx, LABEL_GLOSSY)
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXAniso, microfacet_ggx_aniso, microfacet_ggx, LABEL_GLOSSY|LABEL_REFLECT)
 	CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoClosure, sc.N),
 	CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoClosure, sc.T),
 	CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoClosure, sc.data0),
 	CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoClosure, sc.data1),
 BSDF_CLOSURE_CLASS_END(MicrofacetGGXAniso, microfacet_ggx_aniso)
 
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmann, microfacet_beckmann, microfacet_beckmann, LABEL_GLOSSY)
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmann, microfacet_beckmann, microfacet_beckmann, LABEL_GLOSSY|LABEL_REFLECT)
 	CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannClosure, sc.N),
 	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannClosure, sc.data0),
 BSDF_CLOSURE_CLASS_END(MicrofacetBeckmann, microfacet_beckmann)
 
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannAniso, microfacet_beckmann_aniso, microfacet_beckmann, LABEL_GLOSSY)
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannAniso, microfacet_beckmann_aniso, microfacet_beckmann, LABEL_GLOSSY|LABEL_REFLECT)
 	CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannAnisoClosure, sc.N),
 	CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannAnisoClosure, sc.T),
 	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannAnisoClosure, sc.data0),
 	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannAnisoClosure, sc.data1),
 BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannAniso, microfacet_beckmann_aniso)
 
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXRefraction, microfacet_ggx_refraction, microfacet_ggx, LABEL_GLOSSY)
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXRefraction, microfacet_ggx_refraction, microfacet_ggx, LABEL_GLOSSY|LABEL_TRANSMIT)
 	CLOSURE_FLOAT3_PARAM(MicrofacetGGXRefractionClosure, sc.N),
 	CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, sc.data0),
 	CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, sc.data2),
 BSDF_CLOSURE_CLASS_END(MicrofacetGGXRefraction, microfacet_ggx_refraction)
 
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction, microfacet_beckmann, LABEL_GLOSSY)
+BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction, microfacet_beckmann, LABEL_GLOSSY|LABEL_TRANSMIT)
 	CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannRefractionClosure, sc.N),
 	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, sc.data0),
 	CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, sc.data2),
@@ -244,8 +244,6 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
 		bsdf_diffuse_toon_params(), bsdf_diffuse_toon_prepare);
 	register_closure(ss, "glossy_toon", id++,
 		bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare);
-	register_closure(ss, "specular_toon", id++,
-		bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare);
 	register_closure(ss, "westin_backscatter", id++,
 		bsdf_westin_backscatter_params(), bsdf_westin_backscatter_prepare);
 	register_closure(ss, "westin_sheen", id++,
@@ -267,10 +265,6 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
 		closure_bssrdf_cubic_params(), closure_bssrdf_cubic_prepare);
 	register_closure(ss, "bssrdf_gaussian", id++,
 		closure_bssrdf_gaussian_params(), closure_bssrdf_gaussian_prepare);
-	register_closure(ss, "bssrdf_cubic", id++,
-		closure_bssrdf_cubic_extended_params(), closure_bssrdf_cubic_prepare);
-	register_closure(ss, "bssrdf_gaussian", id++,
-		closure_bssrdf_gaussian_extended_params(), closure_bssrdf_gaussian_prepare);
 
 	register_closure(ss, "hair_reflection", id++,
 		bsdf_hair_reflection_params(), bsdf_hair_reflection_prepare);
diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h
index a543907e884..58d215295dc 100644
--- a/intern/cycles/kernel/osl/osl_closures.h
+++ b/intern/cycles/kernel/osl/osl_closures.h
@@ -52,8 +52,6 @@ OSL::ClosureParam *closure_westin_backscatter_params();
 OSL::ClosureParam *closure_westin_sheen_params();
 OSL::ClosureParam *closure_bssrdf_cubic_params();
 OSL::ClosureParam *closure_bssrdf_gaussian_params();
-OSL::ClosureParam *closure_bssrdf_cubic_extended_params();
-OSL::ClosureParam *closure_bssrdf_gaussian_extended_params();
 OSL::ClosureParam *closure_henyey_greenstein_volume_params();
 
 void closure_emission_prepare(OSL::RendererServices *, int id, void *data);
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 6a59a381f48..38cb5061346 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -126,7 +126,7 @@ void OSLRenderServices::thread_init(KernelGlobals *kernel_globals_, OSL::Texture
 	osl_ts = osl_ts_;
 }
 
-bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time)
+bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time)
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
@@ -156,7 +156,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr
 	return false;
 }
 
-bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time)
+bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time)
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
@@ -186,7 +186,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::Transform
 	return false;
 }
 
-bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from, float time)
+bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from, float time)
 {
 	KernelGlobals *kg = kernel_globals;
 
@@ -218,7 +218,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from, float ti
 	return false;
 }
 
-bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to, float time)
+bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring to, float time)
 {
 	KernelGlobals *kg = kernel_globals;
 
@@ -250,7 +250,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to, fl
 	return false;
 }
 
-bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform)
+bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform)
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
@@ -275,7 +275,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr
 	return false;
 }
 
-bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform)
+bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform)
 {
 	/* this is only used for shader and object space, we don't really have
 	 * a concept of shader space, so we just use object space for both. */
@@ -300,7 +300,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, OSL::Transform
 	return false;
 }
 
-bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from)
+bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from)
 {
 	KernelGlobals *kg = kernel_globals;
 
@@ -328,7 +328,7 @@ bool OSLRenderServices::get_matrix(OSL::Matrix44 &result, ustring from)
 	return false;
 }
 
-bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to)
+bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring to)
 {
 	KernelGlobals *kg = kernel_globals;
 	
@@ -356,7 +356,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::Matrix44 &result, ustring to)
 	return false;
 }
 
-bool OSLRenderServices::get_array_attribute(void *renderstate, bool derivatives, 
+bool OSLRenderServices::get_array_attribute(OSL::ShaderGlobals *sg, bool derivatives, 
                                             ustring object, TypeDesc type, ustring name,
                                             int index, void *val)
 {
@@ -479,7 +479,7 @@ static bool set_attribute_int(int i, TypeDesc type, bool derivatives, void *val)
 
 static bool set_attribute_string(ustring str, TypeDesc type, bool derivatives, void *val)
 {
-	if(type.basetype == TypeDesc::INT && type.aggregate == TypeDesc::SCALAR && type.arraylen == 0) {
+	if(type.basetype == TypeDesc::STRING && type.aggregate == TypeDesc::SCALAR && type.arraylen == 0) {
 		ustring *sval = (ustring *)val;
 		sval[0] = str;
 
@@ -751,13 +751,19 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData *
 		return false;
 }
 
-bool OSLRenderServices::get_attribute(void *renderstate, bool derivatives, ustring object_name,
+bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object_name,
                                       TypeDesc type, ustring name, void *val)
 {
-	if (renderstate == NULL)
+	if (sg->renderstate == NULL)
 		return false;
 
-	ShaderData *sd = (ShaderData *)renderstate;
+	ShaderData *sd = (ShaderData *)(sg->renderstate);
+	return get_attribute(sd, derivatives, object_name, type, name, val);
+}
+
+bool OSLRenderServices::get_attribute(ShaderData *sd, bool derivatives, ustring object_name,
+                                      TypeDesc type, ustring name, void *val)
+{
 	KernelGlobals *kg = sd->osl_globals;
 	bool is_curve;
 	int object;
@@ -815,12 +821,12 @@ bool OSLRenderServices::get_attribute(void *renderstate, bool derivatives, ustri
 }
 
 bool OSLRenderServices::get_userdata(bool derivatives, ustring name, TypeDesc type, 
-                                     void *renderstate, void *val)
+                                     OSL::ShaderGlobals *sg, void *val)
 {
 	return false; /* disabled by lockgeom */
 }
 
-bool OSLRenderServices::has_userdata(ustring name, TypeDesc type, void *renderstate)
+bool OSLRenderServices::has_userdata(ustring name, TypeDesc type, OSL::ShaderGlobals *sg)
 {
 	return false; /* never called by OSL */
 }
@@ -969,7 +975,7 @@ bool OSLRenderServices::environment(ustring filename, TextureOpt &options,
 	return status;
 }
 
-bool OSLRenderServices::get_texture_info(ustring filename, int subimage,
+bool OSLRenderServices::get_texture_info(OSL::ShaderGlobals *sg, ustring filename, int subimage,
                                          ustring dataname,
                                          TypeDesc datatype, void *data)
 {
diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h
index 069722d81b6..6f928a0d103 100644
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -49,27 +49,29 @@ public:
 	
 	void thread_init(KernelGlobals *kernel_globals, OSL::TextureSystem *ts);
 
-	bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time);
-	bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time);
+	bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time);
+	bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform, float time);
 	
-	bool get_matrix(OSL::Matrix44 &result, ustring from, float time);
-	bool get_inverse_matrix(OSL::Matrix44 &result, ustring to, float time);
+	bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from, float time);
+	bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring to, float time);
 	
-	bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform);
-	bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform);
+	bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform);
+	bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, OSL::TransformationPtr xform);
 	
-	bool get_matrix(OSL::Matrix44 &result, ustring from);
-	bool get_inverse_matrix(OSL::Matrix44 &result, ustring from);
+	bool get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from);
+	bool get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from);
 
-	bool get_array_attribute(void *renderstate, bool derivatives,
+	bool get_array_attribute(OSL::ShaderGlobals *sg, bool derivatives,
 	                         ustring object, TypeDesc type, ustring name,
 	                         int index, void *val);
-	bool get_attribute(void *renderstate, bool derivatives, ustring object,
+	bool get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object,
+	                   TypeDesc type, ustring name, void *val);
+	bool get_attribute(ShaderData *sd, bool derivatives, ustring object_name,
 	                   TypeDesc type, ustring name, void *val);
 
 	bool get_userdata(bool derivatives, ustring name, TypeDesc type,
-	                  void *renderstate, void *val);
-	bool has_userdata(ustring name, TypeDesc type, void *renderstate);
+	                  OSL::ShaderGlobals *sg, void *val);
+	bool has_userdata(ustring name, TypeDesc type, OSL::ShaderGlobals *sg);
 
 	int pointcloud_search(OSL::ShaderGlobals *sg, ustring filename, const OSL::Vec3 &center,
 	                      float radius, int max_points, bool sort, size_t *out_indices,
@@ -106,7 +108,7 @@ public:
 	                 OSL::ShaderGlobals *sg, const OSL::Vec3 &R,
 	                 const OSL::Vec3 &dRdx, const OSL::Vec3 &dRdy, float *result);
 
-	bool get_texture_info(ustring filename, int subimage,
+	bool get_texture_info(OSL::ShaderGlobals *sg, ustring filename, int subimage,
 	                      ustring dataname, TypeDesc datatype, void *data);
 
 	static bool get_background_attribute(KernelGlobals *kg, ShaderData *sd, ustring name,
@@ -157,6 +159,70 @@ public:
 	static ustring u_v;
 	static ustring u_empty;
 
+#if OSL_LIBRARY_VERSION_CODE < 10500
+	bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) {
+		return get_matrix(NULL, result, xform, time);
+	}
+
+	bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform, float time) {
+		return get_inverse_matrix(NULL, result, xform, time);
+	}
+
+	bool get_matrix(OSL::Matrix44 &result, ustring from, float time) {
+		return get_matrix(NULL, result, from, time);
+	}
+
+	bool get_inverse_matrix(OSL::Matrix44 &result, ustring to, float time) {
+		return get_inverse_matrix(NULL, result, to, time);
+	}
+
+	bool get_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) {
+		return get_matrix(NULL, result, xform);
+	}
+
+	bool get_inverse_matrix(OSL::Matrix44 &result, OSL::TransformationPtr xform) {
+		return get_inverse_matrix(NULL, result, xform);
+	}
+
+	bool get_matrix(OSL::Matrix44 &result, ustring from) {
+		return get_matrix(NULL, result, from);
+	}
+
+	bool get_inverse_matrix(OSL::Matrix44 &result, ustring to) {
+		return get_inverse_matrix(NULL, result, to);
+	}
+
+	bool get_array_attribute(void *renderstate, bool derivatives,
+	                         ustring object, TypeDesc type, ustring name,
+	                         int index, void *val) {
+		OSL::ShaderGlobals sg;
+		sg.renderstate = renderstate;
+		return get_array_attribute(&sg, derivatives,
+		                           object, type, name,
+		                           index, val);
+	}
+
+	bool get_attribute(void *renderstate, bool derivatives, ustring object_name,
+	                   TypeDesc type, ustring name, void *val) {
+		OSL::ShaderGlobals sg;
+		sg.renderstate = renderstate;
+		return get_attribute(&sg, derivatives, object_name, type, name, val);
+	}
+
+	bool has_userdata(ustring name, TypeDesc type, void *renderstate) {
+		return has_userdata(name, type, (OSL::ShaderGlobals *) renderstate);
+	}
+
+	bool get_userdata(bool derivatives, ustring name, TypeDesc type,
+	                  void *renderstate, void *val) {
+		return get_userdata(derivatives, name, type, (OSL::ShaderGlobals *) renderstate, val);
+	}
+
+	bool get_texture_info(ustring filename, int subimage,
+	                      ustring dataname, TypeDesc datatype, void *data) {
+		return get_texture_info(NULL, filename, subimage, dataname, datatype, data);
+	}
+#endif
 private:
 	KernelGlobals *kernel_globals;
 	OSL::TextureSystem *osl_ts;
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 28135784db9..48498116874 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -164,11 +164,14 @@ static void flatten_surface_closure_tree(ShaderData *sd, int path_flag,
 					CBSDFClosure *bsdf = (CBSDFClosure *)prim;
 					int scattering = bsdf->scattering();
 
-					/* no caustics option */
-					if(scattering == LABEL_GLOSSY && (path_flag & PATH_RAY_DIFFUSE)) {
+					/* caustic options */
+					if((scattering & LABEL_GLOSSY) && (path_flag & PATH_RAY_DIFFUSE)) {
 						KernelGlobals *kg = sd->osl_globals;
-						if(kernel_data.integrator.no_caustics)
+
+						if((!kernel_data.integrator.caustics_reflective && (scattering & LABEL_REFLECT)) ||
+						   (!kernel_data.integrator.caustics_refractive && (scattering & LABEL_TRANSMIT))) {
 							return;
+						}
 					}
 
 					/* sample weight */
diff --git a/intern/cycles/kernel/shaders/node_fresnel.h b/intern/cycles/kernel/shaders/node_fresnel.h
index 447a84255ef..9f10ba8023e 100644
--- a/intern/cycles/kernel/shaders/node_fresnel.h
+++ b/intern/cycles/kernel/shaders/node_fresnel.h
@@ -34,3 +34,16 @@ float fresnel_dielectric_cos(float cosi, float eta)
 	return result;
 }
 
+color fresnel_conductor(float cosi, color eta, color k)
+{
+	color cosi2 = color(cosi*cosi);
+	color one = color(1, 1, 1);
+	color tmp_f = eta * eta + k * k;
+	color tmp = tmp_f * cosi2;
+	color Rparl2 = (tmp - (2.0 * eta * cosi) + one) /
+					(tmp + (2.0 * eta * cosi) + one);
+	color Rperp2 = (tmp_f - (2.0 * eta * cosi) + cosi2) /
+					(tmp_f + (2.0 * eta * cosi) + cosi2);
+	return (Rparl2 + Rperp2) * 0.5;
+}
+
diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h
index e39db8097f2..f8e5fd510ee 100644
--- a/intern/cycles/kernel/shaders/stdosl.h
+++ b/intern/cycles/kernel/shaders/stdosl.h
@@ -507,12 +507,8 @@ closure color hair_transmission(normal N, float roughnessu, float roughnessv, ve
 closure color henyey_greenstein(float g) BUILTIN;
 closure color absorption() BUILTIN;
 
-// Backwards compatibility
-closure color bssrdf_cubic(normal N, vector radius) BUILTIN;
-closure color bssrdf_gaussian(normal N, vector radius) BUILTIN;
-closure color specular_toon(normal N, float size, float smooth) BUILTIN;
-
 // Renderer state
+int backfacing () BUILTIN;
 int raytype (string typename) BUILTIN;
 // the individual 'isFOOray' functions are deprecated
 int iscameraray () { return raytype("camera"); }
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index d6663aae9db..c13eae813d6 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -236,7 +236,7 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 				if(stack_load_float(stack, node.z) == 1.0f)
 					offset += node.y;
 				break;
-#ifdef __IMAGE_TEXTURES__
+#ifdef __TEXTURES__
 			case NODE_TEX_IMAGE:
 				svm_node_tex_image(kg, sd, stack, node);
 				break;
@@ -246,8 +246,6 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_TEX_ENVIRONMENT:
 				svm_node_tex_environment(kg, sd, stack, node);
 				break;
-#endif
-#ifdef __PROCEDURAL_TEXTURES__
 			case NODE_TEX_SKY:
 				svm_node_tex_sky(kg, sd, stack, node, &offset);
 				break;
@@ -420,17 +418,13 @@ ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, Shade
 			case NODE_LIGHT_FALLOFF:
 				svm_node_light_falloff(sd, stack, node);
 				break;
-#endif			
-#ifdef __ANISOTROPIC__
+#endif
 			case NODE_TANGENT:
 				svm_node_tangent(kg, sd, stack, node);
 				break;
-#endif			
-#ifdef __NORMAL_MAP__
 			case NODE_NORMAL_MAP:
 				svm_node_normal_map(kg, sd, stack, node);
-				break;
-#endif			
+				break;	
 			case NODE_END:
 			default:
 				return;
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 5fcc44e478b..30110db3ef9 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -179,7 +179,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: {
 #ifdef __CAUSTICS_TRICKS__
-			if(kernel_data.integrator.no_caustics && (path_flag & PATH_RAY_DIFFUSE))
+			if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
 			ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
@@ -207,7 +207,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: {
 #ifdef __CAUSTICS_TRICKS__
-			if(kernel_data.integrator.no_caustics && (path_flag & PATH_RAY_DIFFUSE))
+			if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
 			ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
@@ -244,8 +244,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 		case CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID: {
 #ifdef __CAUSTICS_TRICKS__
-			if(kernel_data.integrator.no_caustics && (path_flag & PATH_RAY_DIFFUSE))
+			if(!kernel_data.integrator.caustics_reflective &&
+			   !kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE)) {
 				break;
+			}
 #endif
 			/* index of refraction */
 			float eta = fmaxf(param2, 1e-5f);
@@ -262,12 +264,21 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			float sample_weight = sc->sample_weight;
 
 			sc = svm_node_closure_get_bsdf(sd, mix_weight*fresnel);
-
-			if(sc) {
-				sc->N = N;
-				svm_node_glass_setup(sd, sc, type, eta, roughness, false);
+#ifdef __CAUSTICS_TRICKS__
+			if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0)
+#endif
+			{
+				if(sc) {
+					sc->N = N;
+					svm_node_glass_setup(sd, sc, type, eta, roughness, false);
+				}
 			}
 
+#ifdef __CAUSTICS_TRICKS__
+			if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
+				break;
+#endif
+
 			/* refraction */
 			sc = &sd->closure[sd->num_closure];
 			sc->weight = weight;
@@ -286,7 +297,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: {
 #ifdef __CAUSTICS_TRICKS__
-			if(kernel_data.integrator.no_caustics && (path_flag & PATH_RAY_DIFFUSE))
+			if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
 			ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
@@ -294,7 +305,6 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(sc) {
 				sc->N = N;
 
-#ifdef __ANISOTROPIC__
 				sc->T = stack_load_float3(stack, data_node.y);
 
 				/* rotate tangent */
@@ -324,9 +334,6 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					sd->flag |= bsdf_microfacet_ggx_aniso_setup(sc);
 				else
 					sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(sc);
-#else
-				sd->flag |= bsdf_diffuse_setup(sc);
-#endif
 			}
 			break;
 		}
@@ -371,7 +378,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 				if(sc) {
 					/* todo: giving a fixed weight here will cause issues when
-					 * mixing multiple BSDFS. energey will not be conserved and
+					 * mixing multiple BSDFS. energy will not be conserved and
 					 * the throughput can blow up after multiple bounces. we
 					 * better figure out a way to skip backfaces from rays
 					 * spawned by transmission from the front */
diff --git a/intern/cycles/kernel/svm/svm_convert.h b/intern/cycles/kernel/svm/svm_convert.h
index 2503912c5c6..b221e0728ec 100644
--- a/intern/cycles/kernel/svm/svm_convert.h
+++ b/intern/cycles/kernel/svm/svm_convert.h
@@ -45,13 +45,13 @@ ccl_device void svm_node_convert(ShaderData *sd, float *stack, uint type, uint f
 		}
 		case NODE_CONVERT_VF: {
 			float3 f = stack_load_float3(stack, from);
-			float g = (f.x + f.y + f.z)*(1.0f/3.0f);
+			float g = average(f);
 			stack_store_float(stack, to, g);
 			break;
 		}
 		case NODE_CONVERT_VI: {
 			float3 f = stack_load_float3(stack, from);
-			int i = (int)((f.x + f.y + f.z)*(1.0f/3.0f));
+			int i = (int)average(f);
 			stack_store_int(stack, to, i);
 			break;
 		}