211 files changed, 16247 insertions, 9172 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 5322f6abee1..7aab5f4a94a 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -1,31 +1,64 @@
 remove_extra_strict_flags()
 
 set(INC
-	.
-	../util
-	osl
-	svm
+	..
 )
 
 set(INC_SYS
 
 )
 
-set(SRC
+set(SRC_CPU_KERNELS
 	kernels/cpu/kernel.cpp
+	kernels/cpu/kernel_sse2.cpp
+	kernels/cpu/kernel_sse3.cpp
+	kernels/cpu/kernel_sse41.cpp
+	kernels/cpu/kernel_avx.cpp
+	kernels/cpu/kernel_avx2.cpp
+	kernels/cpu/kernel_split.cpp
+	kernels/cpu/kernel_split_sse2.cpp
+	kernels/cpu/kernel_split_sse3.cpp
+	kernels/cpu/kernel_split_sse41.cpp
+	kernels/cpu/kernel_split_avx.cpp
+	kernels/cpu/kernel_split_avx2.cpp
+	kernels/cpu/filter.cpp
+	kernels/cpu/filter_sse2.cpp
+	kernels/cpu/filter_sse3.cpp
+	kernels/cpu/filter_sse41.cpp
+	kernels/cpu/filter_avx.cpp
+	kernels/cpu/filter_avx2.cpp
+)
+
+set(SRC_CUDA_KERNELS
+	kernels/cuda/kernel.cu
+	kernels/cuda/kernel_split.cu
+	kernels/cuda/filter.cu
+)
+
+set(SRC_OPENCL_KERNELS
 	kernels/opencl/kernel.cl
+	kernels/opencl/kernel_state_buffer_size.cl
+	kernels/opencl/kernel_split.cl
 	kernels/opencl/kernel_data_init.cl
+	kernels/opencl/kernel_path_init.cl
 	kernels/opencl/kernel_queue_enqueue.cl
 	kernels/opencl/kernel_scene_intersect.cl
 	kernels/opencl/kernel_lamp_emission.cl
-	kernels/opencl/kernel_background_buffer_update.cl
+	kernels/opencl/kernel_do_volume.cl
+	kernels/opencl/kernel_indirect_background.cl
+	kernels/opencl/kernel_shader_setup.cl
+	kernels/opencl/kernel_shader_sort.cl
 	kernels/opencl/kernel_shader_eval.cl
 	kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+	kernels/opencl/kernel_subsurface_scatter.cl
 	kernels/opencl/kernel_direct_lighting.cl
-	kernels/opencl/kernel_shadow_blocked.cl
+	kernels/opencl/kernel_shadow_blocked_ao.cl
+	kernels/opencl/kernel_shadow_blocked_dl.cl
+	kernels/opencl/kernel_enqueue_inactive.cl
 	kernels/opencl/kernel_next_iteration_setup.cl
-	kernels/opencl/kernel_sum_all_radiance.cl
-	kernels/cuda/kernel.cu
+	kernels/opencl/kernel_indirect_subsurface.cl
+	kernels/opencl/kernel_buffer_update.cl
+	kernels/opencl/filter.cl
 )
 
 set(SRC_BVH_HEADERS
@@ -52,12 +85,10 @@ set(SRC_HEADERS
 	kernel_compat_cpu.h
 	kernel_compat_cuda.h
 	kernel_compat_opencl.h
-	kernel_debug.h
 	kernel_differential.h
 	kernel_emission.h
 	kernel_film.h
 	kernel_globals.h
-	kernel_image_opencl.h
 	kernel_jitter.h
 	kernel_light.h
 	kernel_math.h
@@ -68,6 +99,7 @@ set(SRC_HEADERS
 	kernel_path_common.h
 	kernel_path_state.h
 	kernel_path_surface.h
+	kernel_path_subsurface.h
 	kernel_path_volume.h
 	kernel_projection.h
 	kernel_queues.h
@@ -86,6 +118,18 @@ set(SRC_KERNELS_CPU_HEADERS
 	kernels/cpu/kernel_cpu.h
 	kernels/cpu/kernel_cpu_impl.h
 	kernels/cpu/kernel_cpu_image.h
+	kernels/cpu/filter_cpu.h
+	kernels/cpu/filter_cpu_impl.h
+)
+
+set(SRC_KERNELS_CUDA_HEADERS
+	kernels/cuda/kernel_config.h
+	kernels/cuda/kernel_cuda_image.h
+)
+
+set(SRC_KERNELS_OPENCL_HEADERS
+	kernels/opencl/kernel_split_function.h
+	kernels/opencl/kernel_opencl_image.h
 )
 
 set(SRC_CLOSURE_HEADERS
@@ -109,6 +153,8 @@ set(SRC_CLOSURE_HEADERS
 	closure/bssrdf.h
 	closure/emissive.h
 	closure/volume.h
+	closure/bsdf_principled_diffuse.h
+	closure/bsdf_principled_sheen.h
 )
 
 set(SRC_SVM_HEADERS
@@ -162,8 +208,11 @@ set(SRC_GEOM_HEADERS
 	geom/geom.h
 	geom/geom_attribute.h
 	geom/geom_curve.h
+	geom/geom_curve_intersect.h
 	geom/geom_motion_curve.h
 	geom/geom_motion_triangle.h
+	geom/geom_motion_triangle_intersect.h
+	geom/geom_motion_triangle_shader.h
 	geom/geom_object.h
 	geom/geom_patch.h
 	geom/geom_primitive.h
@@ -173,31 +222,93 @@ set(SRC_GEOM_HEADERS
 	geom/geom_volume.h
 )
 
+set(SRC_FILTER_HEADERS
+	filter/filter.h
+	filter/filter_defines.h
+	filter/filter_features.h
+	filter/filter_features_sse.h
+	filter/filter_kernel.h
+	filter/filter_nlm_cpu.h
+	filter/filter_nlm_gpu.h
+	filter/filter_prefilter.h
+	filter/filter_reconstruction.h
+	filter/filter_transform.h
+	filter/filter_transform_gpu.h
+	filter/filter_transform_sse.h
+)
+
 set(SRC_UTIL_HEADERS
 	../util/util_atomic.h
 	../util/util_color.h
+	../util/util_defines.h
 	../util/util_half.h
 	../util/util_hash.h
 	../util/util_math.h
 	../util/util_math_fast.h
+	../util/util_math_intersect.h
+	../util/util_math_float2.h
+	../util/util_math_float3.h
+	../util/util_math_float4.h
+	../util/util_math_int2.h
+	../util/util_math_int3.h
+	../util/util_math_int4.h
+	../util/util_math_matrix.h
 	../util/util_static_assert.h
 	../util/util_transform.h
 	../util/util_texture.h
 	../util/util_types.h
+	../util/util_types_float2.h
+	../util/util_types_float2_impl.h
+	../util/util_types_float3.h
+	../util/util_types_float3_impl.h
+	../util/util_types_float4.h
+	../util/util_types_float4_impl.h
+	../util/util_types_int2.h
+	../util/util_types_int2_impl.h
+	../util/util_types_int3.h
+	../util/util_types_int3_impl.h
+	../util/util_types_int4.h
+	../util/util_types_int4_impl.h
+	../util/util_types_uchar2.h
+	../util/util_types_uchar2_impl.h
+	../util/util_types_uchar3.h
+	../util/util_types_uchar3_impl.h
+	../util/util_types_uchar4.h
+	../util/util_types_uchar4_impl.h
+	../util/util_types_uint2.h
+	../util/util_types_uint2_impl.h
+	../util/util_types_uint3.h
+	../util/util_types_uint3_impl.h
+	../util/util_types_uint4.h
+	../util/util_types_uint4_impl.h
+	../util/util_types_vector3.h
+	../util/util_types_vector3_impl.h
 )
 
 set(SRC_SPLIT_HEADERS
-	split/kernel_background_buffer_update.h
+	split/kernel_branched.h
+	split/kernel_buffer_update.h
 	split/kernel_data_init.h
 	split/kernel_direct_lighting.h
+	split/kernel_do_volume.h
+	split/kernel_enqueue_inactive.h
 	split/kernel_holdout_emission_blurring_pathtermination_ao.h
+	split/kernel_indirect_background.h
+	split/kernel_indirect_subsurface.h
 	split/kernel_lamp_emission.h
 	split/kernel_next_iteration_setup.h
+	split/kernel_path_init.h
+	split/kernel_queue_enqueue.h
 	split/kernel_scene_intersect.h
+	split/kernel_shader_setup.h
+	split/kernel_shader_sort.h
 	split/kernel_shader_eval.h
-	split/kernel_shadow_blocked.h
+	split/kernel_shadow_blocked_ao.h
+	split/kernel_shadow_blocked_dl.h
 	split/kernel_split_common.h
-	split/kernel_sum_all_radiance.h
+	split/kernel_split_data.h
+	split/kernel_split_data_types.h
+	split/kernel_subsurface_scatter.h
 )
 
 # CUDA module
@@ -217,7 +328,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")
 
 	# warn for other versions
-	if(CUDA_VERSION MATCHES "80")
+	if(CUDA_VERSION MATCHES "80" OR CUDA_VERSION MATCHES "90")
 	else()
 		message(WARNING
 			"CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, "
@@ -225,25 +336,31 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	endif()
 
 	# build for each arch
-	set(cuda_sources kernels/cuda/kernel.cu
+	set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu
 		${SRC_HEADERS}
+		${SRC_KERNELS_CUDA_HEADERS}
 		${SRC_BVH_HEADERS}
 		${SRC_SVM_HEADERS}
 		${SRC_GEOM_HEADERS}
 		${SRC_CLOSURE_HEADERS}
 		${SRC_UTIL_HEADERS}
 	)
+	set(cuda_filter_sources kernels/cuda/filter.cu
+		${SRC_HEADERS}
+		${SRC_KERNELS_CUDA_HEADERS}
+		${SRC_FILTER_HEADERS}
+		${SRC_UTIL_HEADERS}
+	)
 	set(cuda_cubins)
 
-	macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
+	macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental)
 		if(${experimental})
-			set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__")
-			set(cuda_cubin kernel_experimental_${arch}.cubin)
-		else()
-			set(cuda_extra_flags "")
-			set(cuda_cubin kernel_${arch}.cubin)
+			set(flags ${flags} -D__KERNEL_EXPERIMENTAL__)
+			set(name ${name}_experimental)
 		endif()
 
+		set(cuda_cubin ${name}_${arch}.cubin)
+
 		if(WITH_CYCLES_DEBUG)
 			set(cuda_debug_flags "-D__KERNEL_DEBUG__")
 		else()
@@ -256,26 +373,27 @@ if(WITH_CYCLES_CUDA_BINARIES)
 		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
 		set(cuda_math_flags "--use_fast_math")
 
+		set(cuda_kernel_src "/kernels/cuda/${name}.cu")
+
 		add_custom_command(
 			OUTPUT ${cuda_cubin}
 			COMMAND ${cuda_nvcc_command}
 					-arch=${arch}
 					${CUDA_NVCC_FLAGS}
 					-m${CUDA_BITS}
-					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu
+					--cubin ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
 					-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
 					--ptxas-options="-v"
 					${cuda_arch_flags}
 					${cuda_version_flags}
 					${cuda_math_flags}
-					${cuda_extra_flags}
+					${flags}
 					${cuda_debug_flags}
-					-I${CMAKE_CURRENT_SOURCE_DIR}/../util
-					-I${CMAKE_CURRENT_SOURCE_DIR}/svm
+					-I${CMAKE_CURRENT_SOURCE_DIR}/..
 					-DCCL_NAMESPACE_BEGIN=
 					-DCCL_NAMESPACE_END=
 					-DNVCC
-			DEPENDS ${cuda_sources})
+			DEPENDS ${sources})
 
 		delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
 		list(APPEND cuda_cubins ${cuda_cubin})
@@ -288,8 +406,18 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	endmacro()
 
 	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
-		# Compile regular kernel
-		CYCLES_CUDA_KERNEL_ADD(${arch} FALSE)
+		if(CUDA_VERSION MATCHES "90" AND ${arch} MATCHES "sm_2.")
+			message(STATUS "CUDA binaries for ${arch} disabled, not supported by CUDA 9.")
+		else()
+			# Compile regular kernel
+			CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE)
+			CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE)
+
+			if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
+				# Compile split kernel; the source list is quoted so it stays a single macro argument
+				CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D__SPLIT__" "${cuda_sources}" FALSE)
+			endif()
+		endif()
 	endforeach()
 
 	add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins})
@@ -319,38 +447,45 @@ list(APPEND SRC_HEADERS
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
-if(CXX_HAS_SSE)
-	list(APPEND SRC
-		kernels/cpu/kernel_sse2.cpp
-		kernels/cpu/kernel_sse3.cpp
-		kernels/cpu/kernel_sse41.cpp
-	)
+set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
+set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
+set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
 
+if(CXX_HAS_SSE)
 	set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
 	set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
 	set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
 endif()
 
 if(CXX_HAS_AVX)
-	list(APPEND SRC
-		kernels/cpu/kernel_avx.cpp
-	)
 	set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
 endif()
 
 if(CXX_HAS_AVX2)
-	list(APPEND SRC
-		kernels/cpu/kernel_avx2.cpp
-	)
 	set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
 endif()
 
 add_library(cycles_kernel
-	${SRC}
+	${SRC_CPU_KERNELS}
+	${SRC_CUDA_KERNELS}
+	${SRC_OPENCL_KERNELS}
 	${SRC_HEADERS}
 	${SRC_KERNELS_CPU_HEADERS}
+	${SRC_KERNELS_CUDA_HEADERS}
+	${SRC_KERNELS_OPENCL_HEADERS}
 	${SRC_BVH_HEADERS}
 	${SRC_CLOSURE_HEADERS}
+	${SRC_FILTER_HEADERS}
 	${SRC_SVM_HEADERS}
 	${SRC_GEOM_HEADERS}
 	${SRC_SPLIT_HEADERS}
@@ -370,24 +505,16 @@ endif()
 #add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED})
 #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
 
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_background_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/split)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPENCL_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CUDA_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPENCL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/split)
 
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 36798982653..cf0c8542d69 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -27,43 +27,43 @@
 
 CCL_NAMESPACE_BEGIN
 
-#include "bvh_types.h"
+#include "kernel/bvh/bvh_types.h"
 
 /* Common QBVH functions. */
 #ifdef __QBVH__
-#  include "qbvh_nodes.h"
+#  include "kernel/bvh/qbvh_nodes.h"
 #endif
 
 /* Regular BVH traversal */
 
-#include "bvh_nodes.h"
+#include "kernel/bvh/bvh_nodes.h"
 
 #define BVH_FUNCTION_NAME bvh_intersect
 #define BVH_FUNCTION_FEATURES 0
-#include "bvh_traversal.h"
+#include "kernel/bvh/bvh_traversal.h"
 
 #if defined(__INSTANCING__)
 #  define BVH_FUNCTION_NAME bvh_intersect_instancing
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif
 
 #if defined(__HAIR__)
 #  define BVH_FUNCTION_NAME bvh_intersect_hair
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif
 
 #if defined(__OBJECT_MOTION__)
 #  define BVH_FUNCTION_NAME bvh_intersect_motion
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif
 
 #if defined(__HAIR__) && defined(__OBJECT_MOTION__)
 #  define BVH_FUNCTION_NAME bvh_intersect_hair_motion
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif
 
 /* Subsurface scattering BVH traversal */
@@ -71,12 +71,12 @@ CCL_NAMESPACE_BEGIN
 #if defined(__SUBSURFACE__)
 #  define BVH_FUNCTION_NAME bvh_intersect_subsurface
 #  define BVH_FUNCTION_FEATURES BVH_HAIR
-#  include "bvh_subsurface.h"
+#  include "kernel/bvh/bvh_subsurface.h"
 
 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion
 #    define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR
-#    include "bvh_subsurface.h"
+#    include "kernel/bvh/bvh_subsurface.h"
 #  endif
 #endif  /* __SUBSURFACE__ */
 
@@ -85,18 +85,18 @@ CCL_NAMESPACE_BEGIN
 #if defined(__VOLUME__)
 #  define BVH_FUNCTION_NAME bvh_intersect_volume
 #  define BVH_FUNCTION_FEATURES BVH_HAIR
-#  include "bvh_volume.h"
+#  include "kernel/bvh/bvh_volume.h"
 
 #  if defined(__INSTANCING__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#    include "bvh_volume.h"
+#    include "kernel/bvh/bvh_volume.h"
 #  endif
 
 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
-#    include "bvh_volume.h"
+#    include "kernel/bvh/bvh_volume.h"
 #  endif
 #endif  /* __VOLUME__ */
 
@@ -105,30 +105,30 @@ CCL_NAMESPACE_BEGIN
 #if defined(__SHADOW_RECORD_ALL__)
 #  define BVH_FUNCTION_NAME bvh_intersect_shadow_all
 #  define BVH_FUNCTION_FEATURES 0
-#  include "bvh_shadow_all.h"
+#  include "kernel/bvh/bvh_shadow_all.h"
 
 #  if defined(__INSTANCING__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif
 
 #  if defined(__HAIR__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif
 
 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif
 
 #  if defined(__HAIR__) && defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif
 #endif  /* __SHADOW_RECORD_ALL__ */
 
@@ -137,18 +137,18 @@ CCL_NAMESPACE_BEGIN
 #if defined(__VOLUME_RECORD_ALL__)
 #  define BVH_FUNCTION_NAME bvh_intersect_volume_all
 #  define BVH_FUNCTION_FEATURES BVH_HAIR
-#  include "bvh_volume_all.h"
+#  include "kernel/bvh/bvh_volume_all.h"
 
 #  if defined(__INSTANCING__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#    include "bvh_volume_all.h"
+#    include "kernel/bvh/bvh_volume_all.h"
 #  endif
 
 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
-#    include "bvh_volume_all.h"
+#    include "kernel/bvh/bvh_volume_all.h"
 #  endif
 #endif  /* __VOLUME_RECORD_ALL__ */
 
@@ -202,8 +202,9 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
 }
 
 #ifdef __SUBSURFACE__
+/* Note: ray is passed by value to work around a possible CUDA compiler bug. */
 ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
-                                                     const Ray *ray,
+                                                     const Ray ray,
                                                      SubsurfaceIntersection *ss_isect,
                                                      int subsurface_object,
                                                      uint *lcg_state,
@@ -212,7 +213,7 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
 #ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
 		return bvh_intersect_subsurface_motion(kg,
-		                                       ray,
+		                                       &ray,
 		                                       ss_isect,
 		                                       subsurface_object,
 		                                       lcg_state,
@@ -220,7 +221,7 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
 	}
 #endif /* __OBJECT_MOTION__ */
 	return bvh_intersect_subsurface(kg,
-	                                ray,
+	                                &ray,
 	                                ss_isect,
 	                                subsurface_object,
 	                                lcg_state,
@@ -229,30 +230,63 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
 #endif
 
 #ifdef __SHADOW_RECORD_ALL__
-ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits)
+ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
+                                                     const Ray *ray,
+                                                     Intersection *isect,
+                                                     uint visibility,
+                                                     uint max_hits,
+                                                     uint *num_hits)
 {
 #  ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
 #    ifdef __HAIR__
-		if(kernel_data.bvh.have_curves)
-			return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, max_hits, num_hits);
+		if(kernel_data.bvh.have_curves) {
+			return bvh_intersect_shadow_all_hair_motion(kg,
+			                                            ray,
+			                                            isect,
+			                                            visibility,
+			                                            max_hits,
+			                                            num_hits);
+		}
 #    endif /* __HAIR__ */
 
-		return bvh_intersect_shadow_all_motion(kg, ray, isect, max_hits, num_hits);
+		return bvh_intersect_shadow_all_motion(kg,
+		                                       ray,
+		                                       isect,
+		                                       visibility,
+		                                       max_hits,
+		                                       num_hits);
 	}
 #  endif /* __OBJECT_MOTION__ */
 
 #  ifdef __HAIR__
-	if(kernel_data.bvh.have_curves)
-		return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits);
+	if(kernel_data.bvh.have_curves) {
+		return bvh_intersect_shadow_all_hair(kg,
+		                                     ray,
+		                                     isect,
+		                                     visibility,
+		                                     max_hits,
+		                                     num_hits);
+	}
 #  endif /* __HAIR__ */
 
 #  ifdef __INSTANCING__
-	if(kernel_data.bvh.have_instancing)
-		return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
+	if(kernel_data.bvh.have_instancing) {
+		return bvh_intersect_shadow_all_instancing(kg,
+		                                           ray,
+		                                           isect,
+		                                           visibility,
+		                                           max_hits,
+		                                           num_hits);
+	}
 #  endif /* __INSTANCING__ */
 
-	return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
+	return bvh_intersect_shadow_all(kg,
+	                                ray,
+	                                isect,
+	                                visibility,
+	                                max_hits,
+	                                num_hits);
 }
 #endif  /* __SHADOW_RECORD_ALL__ */
 
@@ -357,7 +391,7 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
 #endif
 }
 
-#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__)
+#if defined(__VOLUME_RECORD_ALL__) || (defined(__SHADOW_RECORD_ALL__) && defined(__KERNEL_CPU__))
 /* ToDo: Move to another file? */
 ccl_device int intersections_compare(const void *a, const void *b)
 {
@@ -373,5 +407,28 @@ ccl_device int intersections_compare(const void *a, const void *b)
 }
 #endif
 
-CCL_NAMESPACE_END
+#if defined(__SHADOW_RECORD_ALL__)
+/* Sort hits[0..num_hits) in place (GPU: ascending t; CPU: qsort with intersections_compare). */
+ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
+{
+#ifdef __KERNEL_GPU__
+	/* Use bubble sort which has more friendly memory pattern on GPU.
+	 * The bound is j + 1 < num_hits so an empty array cannot wrap the uint. */
+	for(bool swapped = true; swapped; --num_hits) {
+		swapped = false;
+		for(uint j = 0; j + 1 < num_hits; ++j) {
+			if(hits[j].t > hits[j + 1].t) {
+				struct Intersection tmp = hits[j];
+				hits[j] = hits[j + 1];
+				hits[j + 1] = tmp;
+				swapped = true;
+			}
+		}
+	}
+#else
+	qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
+#endif
+}
+#endif  /* __SHADOW_RECORD_ALL__ */
 
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h
index 726bef1794c..6c33dad5426 100644
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -17,8 +17,8 @@
 // TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and
 // 3-vector which might be faster.
 ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
-                                                           int node_addr,
-                                                           int child)
+                                                                int node_addr,
+                                                                int child)
 {
 	Transform space;
 	const int child_addr = node_addr + child * 3;
@@ -31,12 +31,12 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *k
 
 #if !defined(__KERNEL_SSE2__)
 ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
-                                                 const float3 P,
-                                                 const float3 idir,
-                                                 const float t,
-                                                 const int node_addr,
-                                                 const uint visibility,
-                                                 float dist[2])
+                                                      const float3 P,
+                                                      const float3 idir,
+                                                      const float t,
+                                                      const int node_addr,
+                                                      const uint visibility,
+                                                      float dist[2])
 {
 
 	/* fetch node data */
@@ -52,8 +52,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
 	float c0hiy = (node1.z - P.y) * idir.y;
 	float c0loz = (node2.x - P.z) * idir.z;
 	float c0hiz = (node2.z - P.z) * idir.z;
-	float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
-	float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+	float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
+	float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));
 
 	float c1lox = (node0.y - P.x) * idir.x;
 	float c1hix = (node0.w - P.x) * idir.x;
@@ -61,8 +61,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
 	float c1hiy = (node1.w - P.y) * idir.y;
 	float c1loz = (node2.y - P.z) * idir.z;
 	float c1hiz = (node2.w - P.z) * idir.z;
-	float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
-	float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+	float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
+	float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));
 
 	dist[0] = c0min;
 	dist[1] = c1min;
@@ -78,14 +78,14 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
-                                                        const float3 P,
-                                                        const float3 idir,
-                                                        const float t,
-                                                        const float difl,
-                                                        const float extmax,
-                                                        const int node_addr,
-                                                        const uint visibility,
-                                                        float dist[2])
+                                                             const float3 P,
+                                                             const float3 idir,
+                                                             const float t,
+                                                             const float difl,
+                                                             const float extmax,
+                                                             const int node_addr,
+                                                             const uint visibility,
+                                                             float dist[2])
 {
 
 	/* fetch node data */
@@ -101,8 +101,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
 	float c0hiy = (node1.z - P.y) * idir.y;
 	float c0loz = (node2.x - P.z) * idir.z;
 	float c0hiz = (node2.z - P.z) * idir.z;
-	float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
-	float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+	float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
+	float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));
 
 	float c1lox = (node0.y - P.x) * idir.x;
 	float c1hix = (node0.w - P.x) * idir.x;
@@ -110,8 +110,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
 	float c1hiy = (node1.w - P.y) * idir.y;
 	float c1loz = (node2.y - P.z) * idir.z;
 	float c1hiz = (node2.w - P.z) * idir.z;
-	float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
-	float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+	float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
+	float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));
 
 	if(difl != 0.0f) {
 		float hdiff = 1.0f + difl;
@@ -203,13 +203,13 @@ ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust(
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
-                                                   const float3 P,
-                                                   const float3 dir,
-                                                   const float3 idir,
-                                                   const float t,
-                                                   const int node_addr,
-                                                   const uint visibility,
-                                                   float dist[2])
+                                                        const float3 P,
+                                                        const float3 dir,
+                                                        const float3 idir,
+                                                        const float t,
+                                                        const int node_addr,
+                                                        const uint visibility,
+                                                        float dist[2])
 {
 	int mask = 0;
 	float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
@@ -233,15 +233,15 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
-                                                          const float3 P,
-                                                          const float3 dir,
-                                                          const float3 idir,
-                                                          const float t,
-                                                          const float difl,
-                                                          const float extmax,
-                                                          const int node_addr,
-                                                          const uint visibility,
-                                                          float dist[2])
+                                                               const float3 P,
+                                                               const float3 dir,
+                                                               const float3 idir,
+                                                               const float t,
+                                                               const float difl,
+                                                               const float extmax,
+                                                               const int node_addr,
+                                                               const uint visibility,
+                                                               float dist[2])
 {
 	int mask = 0;
 	float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
@@ -265,13 +265,13 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg
 }
 
 ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
-                                         const float3 P,
-                                         const float3 dir,
-                                         const float3 idir,
-                                         const float t,
-                                         const int node_addr,
-                                         const uint visibility,
-                                         float dist[2])
+                                              const float3 P,
+                                              const float3 dir,
+                                              const float3 idir,
+                                              const float t,
+                                              const int node_addr,
+                                              const uint visibility,
+                                              float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
@@ -296,15 +296,15 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
-                                                const float3 P,
-                                                const float3 dir,
-                                                const float3 idir,
-                                                const float t,
-                                                const float difl,
-                                                const float extmax,
-                                                const int node_addr,
-                                                const uint visibility,
-                                                float dist[2])
+                                                     const float3 P,
+                                                     const float3 dir,
+                                                     const float3 idir,
+                                                     const float t,
+                                                     const float difl,
+                                                     const float extmax,
+                                                     const int node_addr,
+                                                     const uint visibility,
+                                                     float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
@@ -442,19 +442,19 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
-                                                   const float3 P,
-                                                   const float3 dir,
-                                                   const ssef& isect_near,
-                                                   const ssef& isect_far,
-                                                   const int node_addr,
-                                                   const uint visibility,
-                                                   float dist[2])
+                                                        const float3 P,
+                                                        const float3 dir,
+                                                        const ssef& isect_near,
+                                                        const ssef& isect_far,
+                                                        const int node_addr,
+                                                        const uint visibility,
+                                                        float dist[2])
 {
 	Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
 	Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);
 
 	float3 aligned_dir0 = transform_direction(&space0, dir),
-	       aligned_dir1 = transform_direction(&space1, dir);;
+	       aligned_dir1 = transform_direction(&space1, dir);
 	float3 aligned_P0 = transform_point(&space0, P),
 	       aligned_P1 = transform_point(&space1, P);
 	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
@@ -483,8 +483,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
 	ssef tfar_y = max(lower_y, upper_y);
 	ssef tfar_z = max(lower_z, upper_z);
 
-	const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
-	const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+	const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+	const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
 	sseb vmask = tnear <= tfar;
 	dist[0] = tnear.f[0];
 	dist[1] = tnear.f[1];
@@ -503,20 +503,20 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
-                                                          const float3 P,
-                                                          const float3 dir,
-                                                          const ssef& isect_near,
-                                                          const ssef& isect_far,
-                                                          const float difl,
-                                                          const int node_addr,
-                                                          const uint visibility,
-                                                          float dist[2])
+                                                               const float3 P,
+                                                               const float3 dir,
+                                                               const ssef& isect_near,
+                                                               const ssef& isect_far,
+                                                               const float difl,
+                                                               const int node_addr,
+                                                               const uint visibility,
+                                                               float dist[2])
 {
 	Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
 	Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);
 
 	float3 aligned_dir0 = transform_direction(&space0, dir),
-	       aligned_dir1 = transform_direction(&space1, dir);;
+	       aligned_dir1 = transform_direction(&space1, dir);
 	float3 aligned_P0 = transform_point(&space0, P),
 	       aligned_P1 = transform_point(&space1, P);
 	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
@@ -545,8 +545,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg
 	ssef tfar_y = max(lower_y, upper_y);
 	ssef tfar_z = max(lower_z, upper_z);
 
-	const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
-	const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+	const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+	const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
 	sseb vmask;
 	if(difl != 0.0f) {
 		const float round_down = 1.0f - difl;
@@ -574,17 +574,17 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg
 }
 
 ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
-                                         const float3& P,
-                                         const float3& dir,
-                                         const ssef& isect_near,
-                                         const ssef& isect_far,
-                                         const ssef& tsplat,
-                                         const ssef Psplat[3],
-                                         const ssef idirsplat[3],
-                                         const shuffle_swap_t shufflexyz[3],
-                                         const int node_addr,
-                                         const uint visibility,
-                                         float dist[2])
+                                              const float3& P,
+                                              const float3& dir,
+                                              const ssef& isect_near,
+                                              const ssef& isect_far,
+                                              const ssef& tsplat,
+                                              const ssef Psplat[3],
+                                              const ssef idirsplat[3],
+                                              const shuffle_swap_t shufflexyz[3],
+                                              const int node_addr,
+                                              const uint visibility,
+                                              float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
@@ -612,19 +612,19 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
-                                                const float3& P,
-                                                const float3& dir,
-                                                const ssef& isect_near,
-                                                const ssef& isect_far,
-                                                const ssef& tsplat,
-                                                const ssef Psplat[3],
-                                                const ssef idirsplat[3],
-                                                const shuffle_swap_t shufflexyz[3],
-                                                const float difl,
-                                                const float extmax,
-                                                const int node_addr,
-                                                const uint visibility,
-                                                float dist[2])
+                                                     const float3& P,
+                                                     const float3& dir,
+                                                     const ssef& isect_near,
+                                                     const ssef& isect_far,
+                                                     const ssef& tsplat,
+                                                     const ssef Psplat[3],
+                                                     const ssef idirsplat[3],
+                                                     const shuffle_swap_t shufflexyz[3],
+                                                     const float difl,
+                                                     const float extmax,
+                                                     const int node_addr,
+                                                     const uint visibility,
+                                                     float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index 294362ea995..a6a4353562c 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_shadow_all.h"
+#  include "kernel/bvh/qbvh_shadow_all.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -45,6 +45,7 @@ ccl_device_inline
 bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
                                  const Ray *ray,
                                  Intersection *isect_array,
+                                 const uint visibility,
                                  const uint max_hits,
                                  uint *num_hits)
 {
@@ -100,9 +101,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif  /* __KERNEL_SSE2__ */
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -121,7 +119,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               idir,
 				                               isect_t,
 				                               node_addr,
-				                               PATH_RAY_SHADOW,
+				                               visibility,
 				                               dist);
 #else // __KERNEL_SSE2__
 				traverse_mask = NODE_INTERSECT(kg,
@@ -136,7 +134,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               idirsplat,
 				                               shufflexyz,
 				                               node_addr,
-				                               PATH_RAY_SHADOW,
+				                               visibility,
 				                               dist);
 #endif // __KERNEL_SSE2__
 
@@ -187,8 +185,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
 					/* primitive intersection */
 					while(prim_addr < prim_addr2) {
-						kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-
+						kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
 						bool hit;
 
 						/* todo: specialized intersect functions which don't fill in
@@ -198,10 +195,10 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						switch(p_type) {
 							case PRIMITIVE_TRIANGLE: {
 								hit = triangle_intersect(kg,
-								                         &isect_precalc,
 								                         isect_array,
 								                         P,
-								                         PATH_RAY_SHADOW,
+								                         dir,
+								                         visibility,
 								                         object,
 								                         prim_addr);
 								break;
@@ -213,7 +210,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								                                P,
 								                                dir,
 								                                ray->time,
-								                                PATH_RAY_SHADOW,
+								                                visibility,
 								                                object,
 								                                prim_addr);
 								break;
@@ -222,31 +219,32 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #if BVH_FEATURE(BVH_HAIR)
 							case PRIMITIVE_CURVE:
 							case PRIMITIVE_MOTION_CURVE: {
+								const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-									hit = bvh_cardinal_curve_intersect(kg,
-									                                   isect_array,
-									                                   P,
-									                                   dir,
-									                                   PATH_RAY_SHADOW,
-									                                   object,
-									                                   prim_addr,
-									                                   ray->time,
-									                                   type,
-									                                   NULL,
-									                                   0, 0);
+									hit = cardinal_curve_intersect(kg,
+									                               isect_array,
+									                               P,
+									                               dir,
+									                               visibility,
+									                               object,
+									                               prim_addr,
+									                               ray->time,
+									                               curve_type,
+									                               NULL,
+									                               0, 0);
 								}
 								else {
-									hit = bvh_curve_intersect(kg,
-									                          isect_array,
-									                          P,
-									                          dir,
-									                          PATH_RAY_SHADOW,
-									                          object,
-									                          prim_addr,
-									                          ray->time,
-									                          type,
-									                          NULL,
-									                          0, 0);
+									hit = curve_intersect(kg,
+									                      isect_array,
+									                      P,
+									                      dir,
+									                      visibility,
+									                      object,
+									                      prim_addr,
+									                      ray->time,
+									                      curve_type,
+									                      NULL,
+									                      0, 0);
 								}
 								break;
 							}
@@ -308,12 +306,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 
 #  if BVH_FEATURE(BVH_MOTION)
-					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+					isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
 #  else
-					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+					isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
 #  endif
 
-					triangle_intersect_precalc(dir, &isect_precalc);
 					num_hits_in_instance = 0;
 					isect_array->t = isect_t;
 
@@ -353,22 +350,17 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
 #  endif
 
-				triangle_intersect_precalc(dir, &isect_precalc);
-
 				/* scale isect->t to adjust for instancing */
 				for(int i = 0; i < num_hits_in_instance; i++) {
 					(isect_array-i-1)->t *= t_fac;
 				}
 			}
 			else {
-				float ignore_t = FLT_MAX;
-
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
 #  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
 			}
 
 			isect_t = tmax;
@@ -399,6 +391,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          const Ray *ray,
                                          Intersection *isect_array,
+                                         const uint visibility,
                                          const uint max_hits,
                                          uint *num_hits)
 {
@@ -407,6 +400,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
 		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
 		                                    ray,
 		                                    isect_array,
+		                                    visibility,
 		                                    max_hits,
 		                                    num_hits);
 	}
@@ -417,6 +411,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
 		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
 		                                   ray,
 		                                   isect_array,
+		                                   visibility,
 		                                   max_hits,
 		                                   num_hits);
 	}
diff --git a/intern/cycles/kernel/bvh/bvh_subsurface.h b/intern/cycles/kernel/bvh/bvh_subsurface.h
index d9623c94b2e..bda7e34907a 100644
--- a/intern/cycles/kernel/bvh/bvh_subsurface.h
+++ b/intern/cycles/kernel/bvh/bvh_subsurface.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_subsurface.h"
+#  include "kernel/bvh/qbvh_subsurface.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -72,19 +72,19 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	ss_isect->num_hits = 0;
 
 	const int object_flag = kernel_tex_fetch(__object_flag, subsurface_object);
-	if(!(object_flag & SD_TRANSFORM_APPLIED)) {
+	if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 #if BVH_FEATURE(BVH_MOTION)
 		Transform ob_itfm;
-		bvh_instance_motion_push(kg,
-		                         subsurface_object,
-		                         ray,
-		                         &P,
-		                         &dir,
-		                         &idir,
-		                         &isect_t,
-		                         &ob_itfm);
+		isect_t = bvh_instance_motion_push(kg,
+		                                   subsurface_object,
+		                                   ray,
+		                                   &P,
+		                                   &dir,
+		                                   &idir,
+		                                   isect_t,
+		                                   &ob_itfm);
 #else
-		bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t);
+		isect_t = bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, isect_t);
 #endif
 		object = subsurface_object;
 	}
@@ -109,9 +109,6 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -197,9 +194,9 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						for(; prim_addr < prim_addr2; prim_addr++) {
 							kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 							triangle_intersect_subsurface(kg,
-							                              &isect_precalc,
 							                              ss_isect,
 							                              P,
+							                              dir,
 							                              object,
 							                              prim_addr,
 							                              isect_t,
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index a0e478e972b..ae8f54821f2 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_traversal.h"
+#  include "kernel/bvh/qbvh_traversal.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -104,9 +104,6 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -213,7 +210,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						--stack_ptr;
 					}
 				}
-				BVH_DEBUG_NEXT_STEP();
+				BVH_DEBUG_NEXT_NODE();
 			}
 
 			/* if node is leaf, fetch triangle list */
@@ -235,26 +232,26 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					switch(type & PRIMITIVE_ALL) {
 						case PRIMITIVE_TRIANGLE: {
 							for(; prim_addr < prim_addr2; prim_addr++) {
-								BVH_DEBUG_NEXT_STEP();
+								BVH_DEBUG_NEXT_INTERSECTION();
 								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								if(triangle_intersect(kg,
-								                      &isect_precalc,
 								                      isect,
 								                      P,
+								                      dir,
 								                      visibility,
 								                      object,
 								                      prim_addr))
 								{
 									/* shadow ray early termination */
 #if defined(__KERNEL_SSE2__)
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+									if(visibility & PATH_RAY_SHADOW_OPAQUE)
 										return true;
 									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
 #  if BVH_FEATURE(BVH_HAIR)
 									tfar = ssef(isect->t);
 #  endif
 #else
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+									if(visibility & PATH_RAY_SHADOW_OPAQUE)
 										return true;
 #endif
 								}
@@ -264,7 +261,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #if BVH_FEATURE(BVH_MOTION)
 						case PRIMITIVE_MOTION_TRIANGLE: {
 							for(; prim_addr < prim_addr2; prim_addr++) {
-								BVH_DEBUG_NEXT_STEP();
+								BVH_DEBUG_NEXT_INTERSECTION();
 								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								if(motion_triangle_intersect(kg,
 								                             isect,
@@ -277,14 +274,14 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								{
 									/* shadow ray early termination */
 #  if defined(__KERNEL_SSE2__)
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+									if(visibility & PATH_RAY_SHADOW_OPAQUE)
 										return true;
 									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
 #    if BVH_FEATURE(BVH_HAIR)
 									tfar = ssef(isect->t);
 #    endif
 #  else
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+									if(visibility & PATH_RAY_SHADOW_OPAQUE)
 										return true;
 #  endif
 								}
@@ -296,48 +293,49 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						case PRIMITIVE_CURVE:
 						case PRIMITIVE_MOTION_CURVE: {
 							for(; prim_addr < prim_addr2; prim_addr++) {
-								BVH_DEBUG_NEXT_STEP();
-								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+								BVH_DEBUG_NEXT_INTERSECTION();
+								const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
+								kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
 								bool hit;
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-									hit = bvh_cardinal_curve_intersect(kg,
-									                                   isect,
-									                                   P,
-									                                   dir,
-									                                   visibility,
-									                                   object,
-									                                   prim_addr,
-									                                   ray->time,
-									                                   type,
-									                                   lcg_state,
-									                                   difl,
-									                                   extmax);
+									hit = cardinal_curve_intersect(kg,
+									                               isect,
+									                               P,
+									                               dir,
+									                               visibility,
+									                               object,
+									                               prim_addr,
+									                               ray->time,
+									                               curve_type,
+									                               lcg_state,
+									                               difl,
+									                               extmax);
 								}
 								else {
-									hit = bvh_curve_intersect(kg,
-									                          isect,
-									                          P,
-									                          dir,
-									                          visibility,
-									                          object,
-									                          prim_addr,
-									                          ray->time,
-									                          type,
-									                          lcg_state,
-									                          difl,
-									                          extmax);
+									hit = curve_intersect(kg,
+									                      isect,
+									                      P,
+									                      dir,
+									                      visibility,
+									                      object,
+									                      prim_addr,
+									                      ray->time,
+									                      curve_type,
+									                      lcg_state,
+									                      difl,
+									                      extmax);
 								}
 								if(hit) {
 									/* shadow ray early termination */
 #  if defined(__KERNEL_SSE2__)
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+									if(visibility & PATH_RAY_SHADOW_OPAQUE)
 										return true;
 									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
 #    if BVH_FEATURE(BVH_HAIR)
 									tfar = ssef(isect->t);
 #    endif
 #  else
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+									if(visibility & PATH_RAY_SHADOW_OPAQUE)
 										return true;
 #  endif
 								}
@@ -353,11 +351,10 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 
 #  if BVH_FEATURE(BVH_MOTION)
-					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+					isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+					isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
-					triangle_intersect_precalc(dir, &isect_precalc);
 
 #  if defined(__KERNEL_SSE2__)
 					Psplat[0] = ssef(P.x);
@@ -390,11 +387,10 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
 			/* instance pop */
 #  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
-			triangle_intersect_precalc(dir, &isect_precalc);
 
 #  if defined(__KERNEL_SSE2__)
 			Psplat[0] = ssef(P.x);
diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h
index c3abe2e157d..ead424aaaaf 100644
--- a/intern/cycles/kernel/bvh/bvh_types.h
+++ b/intern/cycles/kernel/bvh/bvh_types.h
@@ -50,12 +50,17 @@ CCL_NAMESPACE_BEGIN
 #ifdef __KERNEL_DEBUG__
 #  define BVH_DEBUG_INIT() \
 	do { \
-		isect->num_traversal_steps = 0; \
+		isect->num_traversed_nodes = 0; \
 		isect->num_traversed_instances = 0; \
+		isect->num_intersections = 0; \
 	} while(0)
-#  define BVH_DEBUG_NEXT_STEP() \
+#  define BVH_DEBUG_NEXT_NODE() \
 	do { \
-		++isect->num_traversal_steps; \
+		++isect->num_traversed_nodes; \
+	} while(0)
+#  define BVH_DEBUG_NEXT_INTERSECTION() \
+	do { \
+		++isect->num_intersections; \
 	} while(0)
 #  define BVH_DEBUG_NEXT_INSTANCE() \
 	do { \
@@ -63,7 +68,8 @@ CCL_NAMESPACE_BEGIN
 	} while(0)
 #else  /* __KERNEL_DEBUG__ */
 #  define BVH_DEBUG_INIT()
-#  define BVH_DEBUG_NEXT_STEP()
+#  define BVH_DEBUG_NEXT_NODE()
+#  define BVH_DEBUG_NEXT_INTERSECTION()
 #  define BVH_DEBUG_NEXT_INSTANCE()
 #endif  /* __KERNEL_DEBUG__ */
 
diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h
index f6db399080b..42e626c8e19 100644
--- a/intern/cycles/kernel/bvh/bvh_volume.h
+++ b/intern/cycles/kernel/bvh/bvh_volume.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_volume.h"
+#  include "kernel/bvh/qbvh_volume.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -97,9 +97,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 #if 1
 	/* try to intersect with VDB volumes */
 	int num_volumes = kernel_data.tables.num_volumes;
@@ -212,9 +209,9 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 									continue;
 								}
 								triangle_intersect(kg,
-								                   &isect_precalc,
 								                   isect,
 								                   P,
+								                   dir,
 								                   visibility,
 								                   object,
 								                   prim_addr);
@@ -254,17 +251,13 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					/* instance push */
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 					int object_flag = kernel_tex_fetch(__object_flag, object);
-
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
-
 #  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+						isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+						isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
-						triangle_intersect_precalc(dir, &isect_precalc);
-
 #  if defined(__KERNEL_SSE2__)
 						Psplat[0] = ssef(P.x);
 						Psplat[1] = ssef(P.y);
@@ -301,13 +294,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
 			/* instance pop */
 #  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 #  if defined(__KERNEL_SSE2__)
 			Psplat[0] = ssef(P.x);
 			Psplat[1] = ssef(P.y);
diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h
index 04fe6e02b15..8c94512a0b9 100644
--- a/intern/cycles/kernel/bvh/bvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_volume_all.h"
+#  include "kernel/bvh/qbvh_volume_all.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -128,9 +128,6 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	}
 #endif
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -226,9 +223,9 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 									continue;
 								}
 								hit = triangle_intersect(kg,
-								                         &isect_precalc,
 								                         isect_array,
 								                         P,
+								                         dir,
 								                         visibility,
 								                         object,
 								                         prim_addr);
@@ -314,16 +311,13 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					/* instance push */
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 					int object_flag = kernel_tex_fetch(__object_flag, object);
-
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
-
 #  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+						isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
 #  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+						isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
 #  endif
 
-						triangle_intersect_precalc(dir, &isect_precalc);
 						num_hits_in_instance = 0;
 						isect_array->t = isect_t;
 
@@ -369,20 +363,17 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #  else
 				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
 #  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
 				/* Scale isect->t to adjust for instancing. */
 				for(int i = 0; i < num_hits_in_instance; i++) {
 					(isect_array-i-1)->t *= t_fac;
 				}
 			}
 			else {
-				float ignore_t = FLT_MAX;
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
 #  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
 			}
 
 			isect_t = tmax;
diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h
index 6d22f0b0d6a..3036efd4198 100644
--- a/intern/cycles/kernel/bvh/qbvh_nodes.h
+++ b/intern/cycles/kernel/bvh/qbvh_nodes.h
@@ -126,8 +126,8 @@ ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg
 	const sseb vmask = cast(tnear) > cast(tfar);
 	int mask = (int)movemask(vmask)^0xf;
 #else
-	const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
-	const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+	const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+	const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
 	const sseb vmask = tnear <= tfar;
 	int mask = (int)movemask(vmask);
 #endif
@@ -174,8 +174,8 @@ ccl_device_inline int qbvh_aligned_node_intersect_robust(
 
 	const float round_down = 1.0f - difl;
 	const float round_up = 1.0f + difl;
-	const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
-	const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+	const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+	const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
 	const sseb vmask = round_down*tnear <= round_up*tfar;
 	*dist = tnear;
 	return (int)movemask(vmask);
diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
index 5f4d06f12ea..522213f30ca 100644
--- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
@@ -33,6 +33,7 @@
 ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                              const Ray *ray,
                                              Intersection *isect_array,
+                                             const uint visibility,
                                              const uint max_hits,
                                              uint *num_hits)
 {
@@ -96,24 +97,28 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* Traversal loop. */
 	do {
 		do {
 			/* Traverse internal nodes. */
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+				(void)inodes;
 
+				if(false
 #ifdef __VISIBILITY_FLAG__
-				if((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) {
+				   || ((__float_as_uint(inodes.x) & visibility) == 0)
+#endif
+#if BVH_FEATURE(BVH_MOTION)
+				   || UNLIKELY(ray->time < inodes.y)
+				   || UNLIKELY(ray->time > inodes.z)
+#endif
+				) {
 					/* Pop. */
 					node_addr = traversal_stack[stack_ptr].addr;
 					--stack_ptr;
 					continue;
 				}
-#endif
 
 				ssef dist;
 				int child_mask = NODE_INTERSECT(kg,
@@ -239,7 +244,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			if(node_addr < 0) {
 				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
 #ifdef __VISIBILITY_FLAG__
-				if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) {
+				if((__float_as_uint(leaf.z) & visibility) == 0) {
 					/* Pop. */
 					node_addr = traversal_stack[stack_ptr].addr;
 					--stack_ptr;
@@ -262,8 +267,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 					/* Primitive intersection. */
 					while(prim_addr < prim_addr2) {
-						kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
-
+						kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
 						bool hit;
 
 						/* todo: specialized intersect functions which don't fill in
@@ -273,10 +277,10 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						switch(p_type) {
 							case PRIMITIVE_TRIANGLE: {
 								hit = triangle_intersect(kg,
-								                         &isect_precalc,
 								                         isect_array,
 								                         P,
-								                         PATH_RAY_SHADOW,
+								                         dir,
+								                         visibility,
 								                         object,
 								                         prim_addr);
 								break;
@@ -288,7 +292,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								                                P,
 								                                dir,
 								                                ray->time,
-								                                PATH_RAY_SHADOW,
+								                                visibility,
 								                                object,
 								                                prim_addr);
 								break;
@@ -297,31 +301,32 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #if BVH_FEATURE(BVH_HAIR)
 							case PRIMITIVE_CURVE:
 							case PRIMITIVE_MOTION_CURVE: {
+								const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-									hit = bvh_cardinal_curve_intersect(kg,
-									                                   isect_array,
-									                                   P,
-									                                   dir,
-									                                   PATH_RAY_SHADOW,
-									                                   object,
-									                                   prim_addr,
-									                                   ray->time,
-									                                   type,
-									                                   NULL,
-									                                   0, 0);
+									hit = cardinal_curve_intersect(kg,
+									                               isect_array,
+									                               P,
+									                               dir,
+									                               visibility,
+									                               object,
+									                               prim_addr,
+									                               ray->time,
+									                               curve_type,
+									                               NULL,
+									                               0, 0);
 								}
 								else {
-									hit = bvh_curve_intersect(kg,
-									                          isect_array,
-									                          P,
-									                          dir,
-									                          PATH_RAY_SHADOW,
-									                          object,
-									                          prim_addr,
-									                          ray->time,
-									                          type,
-									                          NULL,
-									                          0, 0);
+									hit = curve_intersect(kg,
+									                      isect_array,
+									                      P,
+									                      dir,
+									                      visibility,
+									                      object,
+									                      prim_addr,
+									                      ray->time,
+									                      curve_type,
+									                      NULL,
+									                      0, 0);
 								}
 								break;
 							}
@@ -383,9 +388,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 
 #  if BVH_FEATURE(BVH_MOTION)
-					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+					isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
 #  else
-					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+					isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
 #  endif
 
 					num_hits_in_instance = 0;
@@ -407,8 +412,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-					triangle_intersect_precalc(dir, &isect_precalc);
-
 					++stack_ptr;
 					kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
 					traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
@@ -438,11 +441,10 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 				}
 			}
 			else {
-				float ignore_t = FLT_MAX;
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
 #  endif
 			}
 
@@ -465,8 +467,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 			object = OBJECT_NONE;
 			node_addr = traversal_stack[stack_ptr].addr;
 			--stack_ptr;
diff --git a/intern/cycles/kernel/bvh/qbvh_subsurface.h b/intern/cycles/kernel/bvh/qbvh_subsurface.h
index ccd36df034a..be7658d11d7 100644
--- a/intern/cycles/kernel/bvh/qbvh_subsurface.h
+++ b/intern/cycles/kernel/bvh/qbvh_subsurface.h
@@ -61,19 +61,19 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	ss_isect->num_hits = 0;
 
 	const int object_flag = kernel_tex_fetch(__object_flag, subsurface_object);
-	if(!(object_flag & SD_TRANSFORM_APPLIED)) {
+	if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 #if BVH_FEATURE(BVH_MOTION)
 		Transform ob_itfm;
-		bvh_instance_motion_push(kg,
-		                         subsurface_object,
-		                         ray,
-		                         &P,
-		                         &dir,
-		                         &idir,
-		                         &isect_t,
-		                         &ob_itfm);
+		isect_t = bvh_instance_motion_push(kg,
+		                                   subsurface_object,
+		                                   ray,
+		                                   &P,
+		                                   &dir,
+		                                   &idir,
+		                                   isect_t,
+		                                   &ob_itfm);
 #else
-		bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t);
+		isect_t = bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, isect_t);
 #endif
 		object = subsurface_object;
 	}
@@ -105,9 +105,6 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* Traversal loop. */
 	do {
 		do {
@@ -253,9 +250,9 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						for(; prim_addr < prim_addr2; prim_addr++) {
 							kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 							triangle_intersect_subsurface(kg,
-							                              &isect_precalc,
 							                              ss_isect,
 							                              P,
+							                              dir,
 							                              object,
 							                              prim_addr,
 							                              isect_t,
diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h
index f2d8e558dcc..335a4afd47a 100644
--- a/intern/cycles/kernel/bvh/qbvh_traversal.h
+++ b/intern/cycles/kernel/bvh/qbvh_traversal.h
@@ -106,20 +106,23 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* Traversal loop. */
 	do {
 		do {
 			/* Traverse internal nodes. */
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+				(void)inodes;
 
 				if(UNLIKELY(node_dist > isect->t)
+#if BVH_FEATURE(BVH_MOTION)
+				   || UNLIKELY(ray->time < inodes.y)
+				   || UNLIKELY(ray->time > inodes.z)
+#endif
 #ifdef __VISIBILITY_FLAG__
-				   || (__float_as_uint(inodes.x) & visibility) == 0)
+				   || (__float_as_uint(inodes.x) & visibility) == 0
 #endif
+				 )
 				{
 					/* Pop. */
 					node_addr = traversal_stack[stack_ptr].addr;
@@ -131,7 +134,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 				int child_mask;
 				ssef dist;
 
-				BVH_DEBUG_NEXT_STEP();
+				BVH_DEBUG_NEXT_NODE();
 
 #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
 				if(difl != 0.0f) {
@@ -326,18 +329,18 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					switch(type & PRIMITIVE_ALL) {
 						case PRIMITIVE_TRIANGLE: {
 							for(; prim_addr < prim_addr2; prim_addr++) {
-								BVH_DEBUG_NEXT_STEP();
+								BVH_DEBUG_NEXT_INTERSECTION();
 								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								if(triangle_intersect(kg,
-								                      &isect_precalc,
 								                      isect,
 								                      P,
+								                      dir,
 								                      visibility,
 								                      object,
 								                      prim_addr)) {
 									tfar = ssef(isect->t);
 									/* Shadow ray early termination. */
-									if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+									if(visibility & PATH_RAY_SHADOW_OPAQUE) {
 										return true;
 									}
 								}
@@ -347,7 +350,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #if BVH_FEATURE(BVH_MOTION)
 						case PRIMITIVE_MOTION_TRIANGLE: {
 							for(; prim_addr < prim_addr2; prim_addr++) {
-								BVH_DEBUG_NEXT_STEP();
+								BVH_DEBUG_NEXT_INTERSECTION();
 								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								if(motion_triangle_intersect(kg,
 								                             isect,
@@ -359,7 +362,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								                             prim_addr)) {
 									tfar = ssef(isect->t);
 									/* Shadow ray early termination. */
-									if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+									if(visibility & PATH_RAY_SHADOW_OPAQUE) {
 										return true;
 									}
 								}
@@ -371,41 +374,42 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						case PRIMITIVE_CURVE:
 						case PRIMITIVE_MOTION_CURVE: {
 							for(; prim_addr < prim_addr2; prim_addr++) {
-								BVH_DEBUG_NEXT_STEP();
-								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+								BVH_DEBUG_NEXT_INTERSECTION();
+								const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
+								kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
 								bool hit;
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-									hit = bvh_cardinal_curve_intersect(kg,
-									                                   isect,
-									                                   P,
-									                                   dir,
-									                                   visibility,
-									                                   object,
-									                                   prim_addr,
-									                                   ray->time,
-									                                   type,
-									                                   lcg_state,
-									                                   difl,
-									                                   extmax);
+									hit = cardinal_curve_intersect(kg,
+									                               isect,
+									                               P,
+									                               dir,
+									                               visibility,
+									                               object,
+									                               prim_addr,
+									                               ray->time,
+									                               curve_type,
+									                               lcg_state,
+									                               difl,
+									                               extmax);
 								}
 								else {
-									hit = bvh_curve_intersect(kg,
-									                          isect,
-									                          P,
-									                          dir,
-									                          visibility,
-									                          object,
-									                          prim_addr,
-									                          ray->time,
-									                          type,
-									                          lcg_state,
-									                          difl,
-									                          extmax);
+									hit = curve_intersect(kg,
+									                      isect,
+									                      P,
+									                      dir,
+									                      visibility,
+									                      object,
+									                      prim_addr,
+									                      ray->time,
+									                      curve_type,
+									                      lcg_state,
+									                      difl,
+									                      extmax);
 								}
 								if(hit) {
 									tfar = ssef(isect->t);
 									/* Shadow ray early termination. */
-									if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+									if(visibility & PATH_RAY_SHADOW_OPAQUE) {
 										return true;
 									}
 								}
@@ -442,8 +446,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-					triangle_intersect_precalc(dir, &isect_precalc);
-
 					++stack_ptr;
 					kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
 					traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
@@ -463,9 +465,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 			/* Instance pop. */
 #  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
 			qbvh_near_far_idx_calc(idir,
@@ -484,8 +486,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 			object = OBJECT_NONE;
 			node_addr = traversal_stack[stack_ptr].addr;
 			node_dist = traversal_stack[stack_ptr].dist;
diff --git a/intern/cycles/kernel/bvh/qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h
index 989873b549b..bcda7bbd251 100644
--- a/intern/cycles/kernel/bvh/qbvh_volume.h
+++ b/intern/cycles/kernel/bvh/qbvh_volume.h
@@ -91,9 +91,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 #if 1
 	/* try to intersect with VDB volumes */
 	int num_volumes = kernel_data.tables.num_volumes;
@@ -284,7 +281,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 									continue;
 								}
 								/* Intersect ray against primitive. */
-								triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, prim_addr);
+								triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr);
 							}
 							break;
 						}
@@ -311,13 +308,11 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					/* Instance push. */
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 					int object_flag = kernel_tex_fetch(__object_flag, object);
-
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
-
 #  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+						isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+						isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
 						qbvh_near_far_idx_calc(idir,
@@ -336,8 +331,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-						triangle_intersect_precalc(dir, &isect_precalc);
-
 						++stack_ptr;
 						kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
 						traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
@@ -361,9 +354,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 			/* Instance pop. */
 #  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
 			qbvh_near_far_idx_calc(idir,
@@ -382,8 +375,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 			object = OBJECT_NONE;
 			node_addr = traversal_stack[stack_ptr].addr;
 			--stack_ptr;
diff --git a/intern/cycles/kernel/bvh/qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h
index 87bbca5d85c..26f31c379c3 100644
--- a/intern/cycles/kernel/bvh/qbvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h
@@ -95,9 +95,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 #if 1
 	/* try to intersect with VDB volumes */
 	int num_volumes = kernel_data.tables.num_volumes;
@@ -298,7 +295,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 									continue;
 								}
 								/* Intersect ray against primitive. */
-								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, prim_addr);
+								hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
 								if(hit) {
 									/* Move on to next entry in intersections array. */
 									isect_array++;
@@ -371,13 +368,11 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					/* Instance push. */
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 					int object_flag = kernel_tex_fetch(__object_flag, object);
-
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
-
 #  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+						isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
 #  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+						isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
 #  endif
 
 						qbvh_near_far_idx_calc(idir,
@@ -396,7 +391,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-						triangle_intersect_precalc(dir, &isect_precalc);
 						num_hits_in_instance = 0;
 						isect_array->t = isect_t;
 
@@ -435,11 +429,10 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 				}
 			}
 			else {
-				float ignore_t = FLT_MAX;
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
 #  endif
 			}
 
@@ -462,8 +455,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 			object = OBJECT_NONE;
 			node_addr = traversal_stack[stack_ptr].addr;
 			--stack_ptr;
diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h
index b7abc1ec507..e799855a65e 100644
--- a/intern/cycles/kernel/closure/alloc.h
+++ b/intern/cycles/kernel/closure/alloc.h
@@ -20,17 +20,17 @@ ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType ty
 {
 	kernel_assert(size <= sizeof(ShaderClosure));
 
-	int num_closure = ccl_fetch(sd, num_closure);
-	int num_closure_extra = ccl_fetch(sd, num_closure_extra);
+	int num_closure = sd->num_closure;
+	int num_closure_extra = sd->num_closure_extra;
 	if(num_closure + num_closure_extra >= MAX_CLOSURE)
 		return NULL;
 
-	ShaderClosure *sc = &ccl_fetch(sd, closure)[num_closure];
+	ShaderClosure *sc = &sd->closure[num_closure];
 
 	sc->type = type;
 	sc->weight = weight;
 
-	ccl_fetch(sd, num_closure)++;
+	sd->num_closure++;
 
 	return sc;
 }
@@ -44,25 +44,25 @@ ccl_device ccl_addr_space void *closure_alloc_extra(ShaderData *sd, int size)
 	 * This lets us keep the same fast array iteration over closures, as we
 	 * found linked list iteration and iteration with skipping to be slower. */
 	int num_extra = ((size + sizeof(ShaderClosure) - 1) / sizeof(ShaderClosure));
-	int num_closure = ccl_fetch(sd, num_closure);
-	int num_closure_extra = ccl_fetch(sd, num_closure_extra) + num_extra;
+	int num_closure = sd->num_closure;
+	int num_closure_extra = sd->num_closure_extra + num_extra;
 
 	if(num_closure + num_closure_extra > MAX_CLOSURE) {
 		/* Remove previous closure. */
-		ccl_fetch(sd, num_closure)--;
-		ccl_fetch(sd, num_closure_extra)++;
+		sd->num_closure--;
+		sd->num_closure_extra++;
 		return NULL;
 	}
 
-	ccl_fetch(sd, num_closure_extra) = num_closure_extra;
-	return (ccl_addr_space void*)(ccl_fetch(sd, closure) + MAX_CLOSURE - num_closure_extra);
+	sd->num_closure_extra = num_closure_extra;
+	return (ccl_addr_space void*)(sd->closure + MAX_CLOSURE - num_closure_extra);
 }
 
 ccl_device_inline ShaderClosure *bsdf_alloc(ShaderData *sd, int size, float3 weight)
 {
 	ShaderClosure *sc = closure_alloc(sd, size, CLOSURE_NONE_ID, weight);
 
-	if(!sc)
+	if(sc == NULL)
 		return NULL;
 
 	float sample_weight = fabsf(average(weight));
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 7e4d5fe2e37..86a00d2124d 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -14,126 +14,144 @@
  * limitations under the License.
  */
 
-#include "../closure/bsdf_ashikhmin_velvet.h"
-#include "../closure/bsdf_diffuse.h"
-#include "../closure/bsdf_oren_nayar.h"
-#include "../closure/bsdf_phong_ramp.h"
-#include "../closure/bsdf_diffuse_ramp.h"
-#include "../closure/bsdf_microfacet.h"
-#include "../closure/bsdf_microfacet_multi.h"
-#include "../closure/bsdf_reflection.h"
-#include "../closure/bsdf_refraction.h"
-#include "../closure/bsdf_transparent.h"
-#include "../closure/bsdf_ashikhmin_shirley.h"
-#include "../closure/bsdf_toon.h"
-#include "../closure/bsdf_hair.h"
+#include "kernel/closure/bsdf_ashikhmin_velvet.h"
+#include "kernel/closure/bsdf_diffuse.h"
+#include "kernel/closure/bsdf_oren_nayar.h"
+#include "kernel/closure/bsdf_phong_ramp.h"
+#include "kernel/closure/bsdf_diffuse_ramp.h"
+#include "kernel/closure/bsdf_microfacet.h"
+#include "kernel/closure/bsdf_microfacet_multi.h"
+#include "kernel/closure/bsdf_reflection.h"
+#include "kernel/closure/bsdf_refraction.h"
+#include "kernel/closure/bsdf_transparent.h"
+#include "kernel/closure/bsdf_ashikhmin_shirley.h"
+#include "kernel/closure/bsdf_toon.h"
+#include "kernel/closure/bsdf_hair.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bsdf_principled_sheen.h"
 #ifdef __SUBSURFACE__
-#  include "../closure/bssrdf.h"
+#  include "kernel/closure/bssrdf.h"
 #endif
 #ifdef __VOLUME__
-#  include "../closure/volume.h"
+#  include "kernel/closure/volume.h"
 #endif
 
 CCL_NAMESPACE_BEGIN
 
 ccl_device_forceinline int bsdf_sample(KernelGlobals *kg,
-                                  ShaderData *sd,
-                                  const ShaderClosure *sc,
-                                  float randu,
-                                  float randv,
-                                  float3 *eval,
-                                  float3 *omega_in,
-                                  differential3 *domega_in,
-                                  float *pdf)
+                                       ShaderData *sd,
+                                       const ShaderClosure *sc,
+                                       float randu,
+                                       float randv,
+                                       float3 *eval,
+                                       float3 *omega_in,
+                                       differential3 *domega_in,
+                                       float *pdf)
 {
 	int label;
 
 	switch(sc->type) {
 		case CLOSURE_BSDF_DIFFUSE_ID:
 		case CLOSURE_BSDF_BSSRDF_ID:
-			label = bsdf_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #ifdef __SVM__
 		case CLOSURE_BSDF_OREN_NAYAR_ID:
-			label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_oren_nayar_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #ifdef __OSL__
 		case CLOSURE_BSDF_PHONG_RAMP_ID:
-			label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_phong_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-			label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_diffuse_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 		case CLOSURE_BSDF_TRANSLUCENT_ID:
-			label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_translucent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_REFLECTION_ID:
-			label = bsdf_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_REFRACTION_ID:
-			label = bsdf_refraction_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_refraction_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_TRANSPARENT_ID:
-			label = bsdf_transparent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_transparent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_GGX_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-			label = bsdf_microfacet_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
-			label = bsdf_microfacet_multi_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
-			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state));
+		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
+			label = bsdf_microfacet_multi_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state);
 			break;
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
-			label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
-			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state));
+		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
+			label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state);
 			break;
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-			label = bsdf_microfacet_beckmann_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-			label = bsdf_ashikhmin_shirley_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-			label = bsdf_ashikhmin_velvet_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-			label = bsdf_diffuse_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_diffuse_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_GLOSSY_TOON_ID:
-			label = bsdf_glossy_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-			label = bsdf_hair_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-			label = bsdf_hair_transmission_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
+#ifdef __PRINCIPLED__
+		case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
+		case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
+			label = bsdf_principled_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+			break;
+		case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
+			label = bsdf_principled_sheen_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+			break;
+#endif  /* __PRINCIPLED__ */
 #endif
 #ifdef __VOLUME__
 		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-			label = volume_henyey_greenstein_sample(sc, ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+			label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 		default:
@@ -157,75 +175,89 @@ float3 bsdf_eval(KernelGlobals *kg,
 {
 	float3 eval;
 
-	if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) {
+	if(dot(sd->Ng, omega_in) >= 0.0f) {
 		switch(sc->type) {
 			case CLOSURE_BSDF_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_ID:
-				eval = bsdf_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 #ifdef __SVM__
 			case CLOSURE_BSDF_OREN_NAYAR_ID:
-				eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 #ifdef __OSL__
 			case CLOSURE_BSDF_PHONG_RAMP_ID:
-				eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_phong_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-				eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 #endif
 			case CLOSURE_BSDF_TRANSLUCENT_ID:
-				eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_translucent_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFLECTION_ID:
-				eval = bsdf_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFRACTION_ID:
-				eval = bsdf_refraction_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_refraction_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSPARENT_ID:
-				eval = bsdf_transparent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
+			case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+			case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-				eval = bsdf_microfacet_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
-				eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
+				eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state);
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
-				eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
+				eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state);
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-				eval = bsdf_microfacet_beckmann_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-				eval = bsdf_diffuse_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_toon_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
-				eval = bsdf_glossy_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-				eval = bsdf_hair_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-				eval = bsdf_hair_transmission_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf);
+				break;
+#ifdef __PRINCIPLED__
+			case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
+			case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
+				eval = bsdf_principled_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
+			case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
+				eval = bsdf_principled_sheen_eval_reflect(sc, sd->I, omega_in, pdf);
+				break;
+#endif  /* __PRINCIPLED__ */
 #endif
 #ifdef __VOLUME__
 			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-				eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
 				break;
 #endif
 			default:
@@ -237,63 +269,77 @@ float3 bsdf_eval(KernelGlobals *kg,
 		switch(sc->type) {
 			case CLOSURE_BSDF_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_ID:
-				eval = bsdf_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 #ifdef __SVM__
 			case CLOSURE_BSDF_OREN_NAYAR_ID:
-				eval = bsdf_oren_nayar_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSLUCENT_ID:
-				eval = bsdf_translucent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_translucent_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFLECTION_ID:
-				eval = bsdf_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFRACTION_ID:
-				eval = bsdf_refraction_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_refraction_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSPARENT_ID:
-				eval = bsdf_transparent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
+			case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+			case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-				eval = bsdf_microfacet_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
-				eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
+				eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state);
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
-				eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
+				eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state);
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-				eval = bsdf_microfacet_beckmann_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-				eval = bsdf_diffuse_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_toon_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
-				eval = bsdf_glossy_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-				eval = bsdf_hair_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-				eval = bsdf_hair_transmission_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf);
+				break;
+#ifdef __PRINCIPLED__
+			case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
+			case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
+				eval = bsdf_principled_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
+			case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
+				eval = bsdf_principled_sheen_eval_transmit(sc, sd->I, omega_in, pdf);
+				break;
+#endif  /* __PRINCIPLED__ */
 #endif
 #ifdef __VOLUME__
 			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-				eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
 				break;
 #endif
 			default:
@@ -311,11 +357,16 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
 #ifdef __SVM__
 	switch(sc->type) {
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
+		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
+		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
 			bsdf_microfacet_multi_ggx_blur(sc, roughness);
 			break;
 		case CLOSURE_BSDF_MICROFACET_GGX_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
 			bsdf_microfacet_ggx_blur(sc, roughness);
 			break;
@@ -349,10 +400,15 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
 		case CLOSURE_BSDF_REFLECTION_ID:
 		case CLOSURE_BSDF_REFRACTION_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
+		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
+		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
+		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
@@ -367,6 +423,11 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
 			return bsdf_hair_merge(a, b);
+#ifdef __PRINCIPLED__
+		case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
+		case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
+			return bsdf_principled_diffuse_merge(a, b);
+#endif
 #ifdef __VOLUME__
 		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
 			return volume_henyey_greenstein_merge(a, b);
@@ -379,5 +440,23 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
 #endif
 }
 
+/* Classifies a closure as diffuse-like or specular-like.
+ * This is needed for generating the denoising feature passes,
+ * which are written on the first bounce where more than 25%
+ * of the sampling weight belongs to diffuse-like closures. */
+ccl_device_inline bool bsdf_is_specular_like(ShaderClosure *sc)
+{
+	if(CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+		return true;
+	}
+
+	if(CLOSURE_IS_BSDF_MICROFACET(sc->type)) {
+		MicrofacetBsdf *bsdf = (MicrofacetBsdf*) sc;
+		return (bsdf->alpha_x*bsdf->alpha_y <= 0.075f*0.075f);
+	}
+
+	return false;
+}
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
index 1cd8246aa71..b6c896c754b 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -143,6 +143,7 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 {
 	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
 	float3 N = bsdf->N;
+	int label = LABEL_REFLECT | LABEL_GLOSSY;
 
 	float NdotI = dot(N, I);
 	if(NdotI > 0.0f) {
@@ -211,6 +212,7 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 			/* Some high number for MIS. */
 			*pdf = 1e6f;
 			*eval = make_float3(1e6f, 1e6f, 1e6f);
+			label = LABEL_REFLECT | LABEL_SINGULAR;
 		}
 		else {
 			/* leave the rest to eval_reflect */
@@ -224,7 +226,7 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 #endif
 	}
 
-	return LABEL_REFLECT|LABEL_GLOSSY;
+	return label;
 }
 
 
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index 7e0f5a7ec75..a5ba2cb2972 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -40,7 +40,6 @@ typedef ccl_addr_space struct VelvetBsdf {
 
 	float sigma;
 	float invsigma2;
-	float3 N;
 } VelvetBsdf;
 
 ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf)
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index dcd187f9305..ec6f1f20996 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -37,7 +37,6 @@ CCL_NAMESPACE_BEGIN
 
 typedef ccl_addr_space struct DiffuseBsdf {
 	SHADER_CLOSURE_BASE;
-	float3 N;
 } DiffuseBsdf;
 
 /* DIFFUSE */
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index 2d982a95fe4..24f40af46a3 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
 typedef ccl_addr_space struct DiffuseRampBsdf {
 	SHADER_CLOSURE_BASE;
 
-	float3 N;
 	float3 *colors;
 } DiffuseRampBsdf;
 
diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h
index bede5f45e7e..daaa26dc6ad 100644
--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -267,7 +267,10 @@ ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng,
 
 	*eval = make_float3(*pdf, *pdf, *pdf);
 
-	kernel_assert(dot(locy, *omega_in) < 0.0f);
+	/* TODO(sergey): Should always be negative, but it seems some precision
+	 * issue is involved here.
+	 */
+	kernel_assert(dot(locy, *omega_in) < 1e-4f);
 
 	return LABEL_TRANSMIT|LABEL_GLOSSY;
 }
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index 0a8d14a00c2..a780bd0cf28 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -36,7 +36,8 @@
 CCL_NAMESPACE_BEGIN
 
 typedef ccl_addr_space struct MicrofacetExtra {
-	float3 color;
+	float3 color, cspec0;
+	float clearcoat;
 } MicrofacetExtra;
 
 typedef ccl_addr_space struct MicrofacetBsdf {
@@ -45,7 +46,6 @@ typedef ccl_addr_space struct MicrofacetBsdf {
 	float alpha_x, alpha_y, ior;
 	MicrofacetExtra *extra;
 	float3 T;
-	float3 N;
 } MicrofacetBsdf;
 
 /* Beckmann and GGX microfacet importance sampling. */
@@ -233,6 +233,36 @@ ccl_device_forceinline float3 microfacet_sample_stretched(
 	return normalize(make_float3(-slope_x, -slope_y, 1.0f));
 } 
 
+/* Calculate the reflection color.
+ *
+ * If Fresnel is used, the color is an interpolation between the F0 color
+ * and white, weighted by the Fresnel term.
+ *
+ * Otherwise it is simply white.
+ */
+ccl_device_forceinline float3 reflection_color(const MicrofacetBsdf *bsdf, float3 L, float3 H) {
+	float3 F = make_float3(1.0f, 1.0f, 1.0f);
+	bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID
+	                   || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID
+	                   || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID);
+
+	if(use_fresnel) {
+		float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+
+		F = interpolate_fresnel_color(L, H, bsdf->ior, F0, bsdf->extra->cspec0);
+	}
+
+	return F;
+}
+
+ccl_device_forceinline float D_GTR1(float NdotH, float alpha)
+{
+	if(alpha >= 1.0f) return M_1_PI_F;
+	float alpha2 = alpha*alpha;
+	float t = 1.0f + (alpha2 - 1.0f) * NdotH*NdotH;
+	return (alpha2 - 1.0f) / (M_PI_F * logf(alpha2) * t);
+}
+
 /* GGX microfacet with Smith shadow-masking from:
  *
  * Microfacet Models for Refraction through Rough Surfaces
@@ -248,14 +278,52 @@ ccl_device_forceinline float3 microfacet_sample_stretched(
 
 ccl_device int bsdf_microfacet_ggx_setup(MicrofacetBsdf *bsdf)
 {
+	bsdf->extra = NULL;
+
 	bsdf->alpha_x = saturate(bsdf->alpha_x);
 	bsdf->alpha_y = bsdf->alpha_x;
-	
+
 	bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ID;
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
+ccl_device int bsdf_microfacet_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+	bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
+	bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+	bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
+
+	float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+	float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+	bsdf->sample_weight *= F;
+
+	bsdf->alpha_x = saturate(bsdf->alpha_x);
+	bsdf->alpha_y = bsdf->alpha_x;
+
+	bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID;
+
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
+}
+
+ccl_device int bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+	bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
+	bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+	bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
+
+	float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+	float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+	bsdf->sample_weight *= 0.25f * bsdf->extra->clearcoat * F;
+
+	bsdf->alpha_x = saturate(bsdf->alpha_x);
+	bsdf->alpha_y = bsdf->alpha_x;
+
+	bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID;
+
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
+}
+
 ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosure *b)
 {
 	const MicrofacetBsdf *bsdf_a = (const MicrofacetBsdf*)a;
@@ -266,23 +334,45 @@ ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosur
 	       (bsdf_a->alpha_y == bsdf_b->alpha_y) &&
 	       (isequal_float3(bsdf_a->T, bsdf_b->T)) &&
 	       (bsdf_a->ior == bsdf_b->ior) &&
-	       ((!bsdf_a->extra && !bsdf_b->extra) ||
+	       ((bsdf_a->extra == NULL && bsdf_b->extra == NULL) ||
 	        ((bsdf_a->extra && bsdf_b->extra) &&
 	         (isequal_float3(bsdf_a->extra->color, bsdf_b->extra->color))));
 }
 
 ccl_device int bsdf_microfacet_ggx_aniso_setup(MicrofacetBsdf *bsdf)
 {
+	bsdf->extra = NULL;
+
 	bsdf->alpha_x = saturate(bsdf->alpha_x);
 	bsdf->alpha_y = saturate(bsdf->alpha_y);
-	
+
 	bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID;
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL;
 }
 
+ccl_device int bsdf_microfacet_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+	/* Anisotropic GGX reflection with Fresnel: tag the closure; each axis keeps its own saturated roughness. */
+	bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID;
+	bsdf->alpha_x = saturate(bsdf->alpha_x);
+	bsdf->alpha_y = saturate(bsdf->alpha_y);
+
+	/* Saturate the specular color per channel before it feeds the Fresnel interpolation below. */
+	bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
+	bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+	bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
+
+	/* Weight the closure for sampling by the channel average of the interpolated Fresnel color. */
+	const float fresnel_base = fresnel_dielectric_cos(1.0f, bsdf->ior);
+	bsdf->sample_weight *= average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, fresnel_base, bsdf->extra->cspec0));
+	return SD_BSDF|SD_BSDF_HAS_EVAL;
+}
+
 ccl_device int bsdf_microfacet_ggx_refraction_setup(MicrofacetBsdf *bsdf)
 {
+	bsdf->extra = NULL;
+
 	bsdf->alpha_x = saturate(bsdf->alpha_x);
 	bsdf->alpha_y = bsdf->alpha_x;
 
@@ -319,6 +409,8 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
 		float alpha2 = alpha_x * alpha_y;
 		float D, G1o, G1i;
 
+		bool is_principled_clearcoat = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID);
+
 		if(alpha_x == alpha_y) {
 			/* isotropic
 			 * eq. 20: (F*G*D)/(4*in*on)
@@ -327,7 +419,18 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
 			float cosThetaM2 = cosThetaM * cosThetaM;
 			float cosThetaM4 = cosThetaM2 * cosThetaM2;
 			float tanThetaM2 = (1 - cosThetaM2) / cosThetaM2;
-			D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
+
+			if(is_principled_clearcoat) {
+				/* use GTR1 for clearcoat */
+				D = D_GTR1(cosThetaM, bsdf->alpha_x);
+
+				/* the alpha value for clearcoat is a fixed 0.25 => alpha2 = 0.25 * 0.25 */
+				alpha2 = 0.0625f;
+			}
+			else {
+				/* use GTR2 otherwise */
+				D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
+			}
 
 			/* eq. 34: now calculate G1(i,m) and G1(o,m) */
 			G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
@@ -374,7 +477,13 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
 
 		/* eq. 20 */
 		float common = D * 0.25f / cosNO;
-		float out = G * common;
+
+		float3 F = reflection_color(bsdf, omega_in, m);
+		if(is_principled_clearcoat) {
+			F *= 0.25f * bsdf->extra->clearcoat;
+		}
+
+		float3 out = F * G * common;
 
 		/* eq. 2 in distribution of visible normals sampling
 		 * pm = Dw = G1o * dot(m, I) * D / dot(N, I); */
@@ -384,7 +493,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(const ShaderClosure *sc, cons
 		 * pdf = pm * 0.25 / dot(m, I); */
 		*pdf = G1o * common;
 
-		return make_float3(out, out, out);
+		return out;
 	}
 
 	return make_float3(0.0f, 0.0f, 0.0f);
@@ -452,6 +561,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 	float alpha_y = bsdf->alpha_y;
 	bool m_refractive = bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
 	float3 N = bsdf->N;
+	int label;
 
 	float cosNO = dot(N, I);
 	if(cosNO > 0) {
@@ -477,6 +587,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 		/* reflection or refraction? */
 		if(!m_refractive) {
 			float cosMO = dot(m, I);
+			label = LABEL_REFLECT | LABEL_GLOSSY;
 
 			if(cosMO > 0) {
 				/* eq. 39 - compute actual reflected direction */
@@ -487,6 +598,17 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 						/* some high number for MIS */
 						*pdf = 1e6f;
 						*eval = make_float3(1e6f, 1e6f, 1e6f);
+
+						bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID
+						                   || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID
+						                   || bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID);
+
+						/* if fresnel is used, calculate the color with reflection_color(...) */
+						if(use_fresnel) {
+							*eval *= reflection_color(bsdf, *omega_in, m);
+						}
+
+						label = LABEL_REFLECT | LABEL_SINGULAR;
 					}
 					else {
 						/* microfacet normal is visible to this ray */
@@ -494,16 +616,32 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 						float alpha2 = alpha_x * alpha_y;
 						float D, G1i;
 
+						bool is_principled_clearcoat = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID);
+
 						if(alpha_x == alpha_y) {
 							/* isotropic */
 							float cosThetaM2 = cosThetaM * cosThetaM;
 							float cosThetaM4 = cosThetaM2 * cosThetaM2;
 							float tanThetaM2 = 1/(cosThetaM2) - 1;
-							D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
 
 							/* eval BRDF*cosNI */
 							float cosNI = dot(N, *omega_in);
 
+							if(is_principled_clearcoat) {
+								/* use GTR1 for clearcoat */
+								D = D_GTR1(cosThetaM, bsdf->alpha_x);
+
+								/* the alpha value for clearcoat is a fixed 0.25 => alpha2 = 0.25 * 0.25 */
+								alpha2 = 0.0625f;
+
+								/* recalculate G1o */
+								G1o = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNO * cosNO) / (cosNO * cosNO)));
+							}
+							else {
+								/* use GTR2 otherwise */
+								D = alpha2 / (M_PI_F * cosThetaM4 * (alpha2 + tanThetaM2) * (alpha2 + tanThetaM2));
+							}
+
 							/* eq. 34: now calculate G1(i,m) */
 							G1i = 2 / (1 + safe_sqrtf(1 + alpha2 * (1 - cosNI * cosNI) / (cosNI * cosNI))); 
 						}
@@ -535,10 +673,14 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 
 						/* see eval function for derivation */
 						float common = (G1o * D) * 0.25f / cosNO;
-						float out = G1i * common;
 						*pdf = common;
 
-						*eval = make_float3(out, out, out);
+						float3 F = reflection_color(bsdf, *omega_in, m);
+						if(is_principled_clearcoat) {
+							F *= 0.25f * bsdf->extra->clearcoat;
+						}
+
+						*eval = G1i * common * F;
 					}
 
 #ifdef __RAY_DIFFERENTIALS__
@@ -549,6 +691,8 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 			}
 		}
 		else {
+			label = LABEL_TRANSMIT | LABEL_GLOSSY;
+
 			/* CAUTION: the i and o variables are inverted relative to the paper
 			 * eq. 39 - compute actual refractive direction */
 			float3 R, T;
@@ -576,6 +720,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 					/* some high number for MIS */
 					*pdf = 1e6f;
 					*eval = make_float3(1e6f, 1e6f, 1e6f);
+					label = LABEL_TRANSMIT | LABEL_SINGULAR;
 				}
 				else {
 					/* eq. 33 */
@@ -607,7 +752,10 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 			}
 		}
 	}
-	return (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY;
+	else {
+		label = (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY;
+	}
+	return label;
 }
 
 /* Beckmann microfacet with Smith shadow-masking from:
@@ -815,6 +963,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 	float alpha_y = bsdf->alpha_y;
 	bool m_refractive = bsdf->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
 	float3 N = bsdf->N;
+	int label;
 
 	float cosNO = dot(N, I);
 	if(cosNO > 0) {
@@ -839,6 +988,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 
 		/* reflection or refraction? */
 		if(!m_refractive) {
+			label = LABEL_REFLECT | LABEL_GLOSSY;
 			float cosMO = dot(m, I);
 
 			if(cosMO > 0) {
@@ -850,6 +1000,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 						/* some high number for MIS */
 						*pdf = 1e6f;
 						*eval = make_float3(1e6f, 1e6f, 1e6f);
+						label = LABEL_REFLECT | LABEL_SINGULAR;
 					}
 					else {
 						/* microfacet normal is visible to this ray
@@ -904,6 +1055,8 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 			}
 		}
 		else {
+			label = LABEL_TRANSMIT | LABEL_GLOSSY;
+
 			/* CAUTION: the i and o variables are inverted relative to the paper
 			 * eq. 39 - compute actual refractive direction */
 			float3 R, T;
@@ -931,6 +1084,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 					/* some high number for MIS */
 					*pdf = 1e6f;
 					*eval = make_float3(1e6f, 1e6f, 1e6f);
+					label = LABEL_TRANSMIT | LABEL_SINGULAR;
 				}
 				else {
 					/* eq. 33 */
@@ -963,7 +1117,10 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 			}
 		}
 	}
-	return (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY;
+	else {
+		label = (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY;
+	}
+	return label;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index cea59adfebe..2f2c35d5d1f 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -40,20 +40,20 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
 }
 
 /* Sample slope distribution (based on page 14 of the supplemental implementation). */
-ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
+ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float randx, const float randy)
 {
-	if(cosI > 0.9999f || cosI < 1e-6f) {
-		const float r = sqrtf(randU.x / (1.0f - randU.x));
-		const float phi = M_2PI_F * randU.y;
+	if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) {
+		const float r = sqrtf(randx / max(1.0f - randx, 1e-7f));
+		const float phi = M_2PI_F * randy;
 		return make_float2(r*cosf(phi), r*sinf(phi));
 	}
 
-	const float sinI = sqrtf(1.0f - cosI*cosI);
+	const float sinI = safe_sqrtf(1.0f - cosI*cosI);
 	const float tanI = sinI/cosI;
 	const float projA = 0.5f * (cosI + 1.0f);
 	if(projA < 0.0001f)
 		return make_float2(0.0f, 0.0f);
-	const float A = 2.0f*randU.x*projA / cosI - 1.0f;
+	const float A = 2.0f*randx*projA / cosI - 1.0f;
 	float tmp = A*A-1.0f;
 	if(fabsf(tmp) < 1e-7f)
 		return make_float2(0.0f, 0.0f);
@@ -64,26 +64,26 @@ ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 ran
 	const float slopeX = (A < 0.0f || slopeX2 > 1.0f/tanI)? (tanI*tmp - D) : slopeX2;
 
 	float U2;
-	if(randU.y >= 0.5f)
-		U2 = 2.0f*(randU.y - 0.5f);
+	if(randy >= 0.5f)
+		U2 = 2.0f*(randy - 0.5f);
 	else
-		U2 = 2.0f*(0.5f - randU.y);
+		U2 = 2.0f*(0.5f - randy);
 	const float z = (U2*(U2*(U2*0.27385f-0.73369f)+0.46341f)) / (U2*(U2*(U2*0.093073f+0.309420f)-1.0f)+0.597999f);
 	const float slopeY = z * sqrtf(1.0f + slopeX*slopeX);
 
-	if(randU.y >= 0.5f)
+	if(randy >= 0.5f)
 		return make_float2(slopeX, slopeY);
 	else
 		return make_float2(slopeX, -slopeY);
 }
 
 /* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). */
-ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU)
+ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float randx, const float randy)
 {
 	const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z));
-	const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU);
+	const float2 slope_11 = mf_sampleP22_11(wi_11.z, randx, randy);
 
-	const float2 cossin_phi = normalize(make_float2(wi_11.x, wi_11.y));
+	const float3 cossin_phi = safe_normalize(make_float3(wi_11.x, wi_11.y, 0.0f));
 	const float slope_x = alpha.x*(cossin_phi.x * slope_11.x - cossin_phi.y * slope_11.y);
 	const float slope_y = alpha.y*(cossin_phi.y * slope_11.x + cossin_phi.x * slope_11.y);
 
@@ -91,18 +91,15 @@ ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha
 	return normalize(make_float3(-slope_x, -slope_y, 1.0f));
 }
 
-/* === Phase functions: Glossy, Diffuse and Glass === */
+/* === Phase functions: Glossy and Glass === */
 
-/* Phase function for reflective materials, either without a fresnel term (for compatibility) or with the conductive fresnel term. */
-ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm)
+/* Phase function for reflective materials. */
+ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *weight, const float3 wm)
 {
-	if(n && k)
-		*weight *= fresnel_conductor(dot(wi, wm), *n, *k);
-
 	return -wi + 2.0f * wm * dot(wi, wm);
 }
 
-ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k)
+ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha)
 {
 	if(w.z > 0.9999f)
 		return make_float3(0.0f, 0.0f, 0.0f);
@@ -123,30 +120,9 @@ ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float l
 	else
 		phase *= D_ggx_aniso(wh, alpha);
 
-	if(n && k) {
-		/* Apply conductive fresnel term. */
-		return phase * fresnel_conductor(dotW_WH, *n, *k);
-	}
-
 	return make_float3(phase, phase, phase);
 }
 
-/* Phase function for rough lambertian diffuse surfaces. */
-ccl_device_forceinline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv)
-{
-	float3 tm, bm;
-	make_orthonormals(wm, &tm, &bm);
-
-	float2 disk = concentric_sample_disk(randu, randv);
-	return disk.x*tm + disk.y*bm + safe_sqrtf(1.0f - disk.x*disk.x - disk.y*disk.y)*wm;
-}
-
-ccl_device_forceinline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm)
-{
-	const float v = max(0.0f, dot(w, wm)) * M_1_PI_F;
-	return make_float3(v, v, v);
-}
-
 /* Phase function for dielectric transmissive materials, including both reflection and refraction according to the dielectric fresnel term. */
 ccl_device_forceinline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside)
 {
@@ -269,40 +245,69 @@ ccl_device_forceinline float mf_ggx_albedo(float r)
 	return saturate(albedo);
 }
 
+ccl_device_inline float mf_ggx_transmission_albedo(float a, float ior)
+{
+	/* Inputs to the fit: saturated roughness, and an IOR that is inverted when below one and then clamped to [1, 3]. */
+	const float rough = saturate(a);
+	const float eta = clamp((ior < 1.0f)? 1.0f/ior: ior, 1.0f, 3.0f);
+	/* Two IOR-dependent and two roughness-dependent terms; the result below is saturated before returning. */
+	const float ior_term_a = 0.0476898f*expf(-0.978352f*(eta-0.65657f)*(eta-0.65657f)) - 0.033756f*eta + 0.993261f;
+	const float rough_term_a = (((0.116991f*rough - 0.270369f)*rough + 0.0501366f)*rough - 0.00411511f)*rough + 1.00008f;
+	const float ior_term_b = (((-2.08704f*eta + 26.3298f)*eta - 127.906f)*eta + 292.958f)*eta - 287.946f + 199.803f/(eta*eta) - 101.668f/(eta*eta*eta);
+	const float rough_term_b = ((((5.3725f*rough -24.9307f)*rough + 22.7437f)*rough - 3.40751f)*rough + 0.0986325f)*rough + 0.00493504f;
+
+	const float fit = 1.0f + ior_term_b*rough_term_b*0.0019127f - (1.0f - ior_term_a)*(1.0f - rough_term_a)*9.3205f;
+	return saturate(fit);
+}
+
 ccl_device_forceinline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha)
 {
 	float D = D_ggx(normalize(wi+wo), alpha);
 	float lambda = mf_lambda(wi, make_float2(alpha, alpha));
+	float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f);
+
+	float multiscatter = wo.z * M_1_PI_F;
+
 	float albedo = mf_ggx_albedo(alpha);
-	return 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f) + (1.0f - albedo) * wo.z;
+	return albedo*singlescatter + (1.0f - albedo)*multiscatter;
 }
 
 ccl_device_forceinline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha)
 {
-	return 0.25f * D_ggx_aniso(normalize(wi+wo), alpha) / ((1.0f + mf_lambda(wi, alpha)) * wi.z) + (1.0f - mf_ggx_albedo(sqrtf(alpha.x*alpha.y))) * wo.z;
-}
+	float D = D_ggx_aniso(normalize(wi+wo), alpha);
+	float lambda = mf_lambda(wi, alpha);
+	float singlescatter = 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f);
 
-ccl_device_forceinline float mf_diffuse_pdf(const float3 wo)
-{
-	return M_1_PI_F * wo.z;
+	float multiscatter = wo.z * M_1_PI_F;
+
+	float albedo = mf_ggx_albedo(sqrtf(alpha.x*alpha.y));
+	return albedo*singlescatter + (1.0f - albedo)*multiscatter;
 }
 
 ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta)
 {
-	float3 wh;
-	float fresnel;
-	if(wi.z*wo.z > 0.0f) {
-		wh = normalize(wi + wo);
-		fresnel = fresnel_dielectric_cos(dot(wi, wh), eta);
-	}
-	else {
-		wh = normalize(wi + wo*eta);
-		fresnel = 1.0f - fresnel_dielectric_cos(dot(wi, wh), eta);
-	}
+	bool reflective = (wi.z*wo.z > 0.0f);
+
+	float wh_len;
+	float3 wh = normalize_len(wi + (reflective? wo : (wo*eta)), &wh_len);
 	if(wh.z < 0.0f)
 		wh = -wh;
 	float3 r_wi = (wi.z < 0.0f)? -wi: wi;
-	return fresnel * max(0.0f, dot(r_wi, wh)) * D_ggx(wh, alpha) / ((1.0f + mf_lambda(r_wi, make_float2(alpha, alpha))) * r_wi.z) + fabsf(wo.z);
+	float lambda = mf_lambda(r_wi, make_float2(alpha, alpha));
+	float D = D_ggx(wh, alpha);
+	float fresnel = fresnel_dielectric_cos(dot(r_wi, wh), eta);
+
+	float multiscatter = fabsf(wo.z * M_1_PI_F);
+	if(reflective) {
+		float singlescatter = 0.25f * D / max((1.0f + lambda) * r_wi.z, 1e-7f);
+		float albedo = mf_ggx_albedo(alpha);
+		return fresnel * (albedo*singlescatter + (1.0f - albedo)*multiscatter);
+	}
+	else {
+		float singlescatter = fabsf(dot(r_wi, wh)*dot(wo, wh) * D * eta*eta / max((1.0f + lambda) * r_wi.z * wh_len*wh_len, 1e-7f));
+		float albedo = mf_ggx_transmission_albedo(alpha, eta);
+		return (1.0f - fresnel) * (albedo*singlescatter + (1.0f - albedo)*multiscatter);
+	}
 }
 
 /* === Actual random walk implementations, one version of mf_eval and mf_sample per phase function. === */
@@ -313,18 +318,11 @@ ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, cons
 
 #define MF_PHASE_FUNCTION glass
 #define MF_MULTI_GLASS
-#include "bsdf_microfacet_multi_impl.h"
-
-/* The diffuse phase function is not implemented as a node yet. */
-#if 0
-#define MF_PHASE_FUNCTION diffuse
-#define MF_MULTI_DIFFUSE
-#include "bsdf_microfacet_multi_impl.h"
-#endif
+#include "kernel/closure/bsdf_microfacet_multi_impl.h"
 
 #define MF_PHASE_FUNCTION glossy
 #define MF_MULTI_GLOSSY
-#include "bsdf_microfacet_multi_impl.h"
+#include "kernel/closure/bsdf_microfacet_multi_impl.h"
 
 ccl_device void bsdf_microfacet_multi_ggx_blur(ShaderClosure *sc, float roughness)
 {
@@ -345,8 +343,9 @@ ccl_device int bsdf_microfacet_multi_ggx_common_setup(MicrofacetBsdf *bsdf)
 	bsdf->extra->color.x = saturate(bsdf->extra->color.x);
 	bsdf->extra->color.y = saturate(bsdf->extra->color.y);
 	bsdf->extra->color.z = saturate(bsdf->extra->color.z);
-
-	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
+	bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
+	bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+	bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
 
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG;
 }
@@ -356,6 +355,22 @@ ccl_device int bsdf_microfacet_multi_ggx_aniso_setup(MicrofacetBsdf *bsdf)
 	if(is_zero(bsdf->T))
 		bsdf->T = make_float3(1.0f, 0.0f, 0.0f);
 
+	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
+
+	return bsdf_microfacet_multi_ggx_common_setup(bsdf);
+}
+
+ccl_device int bsdf_microfacet_multi_ggx_aniso_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+	if(is_zero(bsdf->T))
+		bsdf->T = make_float3(1.0f, 0.0f, 0.0f);  /* zero tangent: substitute the X axis */
+	/* Mark the closure as the Fresnel variant; the multi-GGX eval and sample code test for this type ID. */
+	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID;
+	/* Scale the sampling weight by the averaged Fresnel color. cspec0 is read as supplied; it is saturated later, in the common setup. */
+	float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+	float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+	bsdf->sample_weight *= F;
+	/* Color saturation and the returned flags are shared with the non-Fresnel variants. */
 	return bsdf_microfacet_multi_ggx_common_setup(bsdf);
 }
 
@@ -363,6 +378,30 @@ ccl_device int bsdf_microfacet_multi_ggx_setup(MicrofacetBsdf *bsdf)
 {
 	bsdf->alpha_y = bsdf->alpha_x;
 
+	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
+
+	return bsdf_microfacet_multi_ggx_common_setup(bsdf);
+}
+
+ccl_device int bsdf_microfacet_multi_ggx_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+	bsdf->alpha_y = bsdf->alpha_x;  /* isotropic: both axes share alpha_x */
+	/* Mark the closure as the Fresnel variant; the multi-GGX eval and sample code test for this type ID. */
+	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID;
+	/* Scale the sampling weight by the averaged Fresnel color. cspec0 is read as supplied; it is saturated later, in the common setup. */
+	float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
+	float F = average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0));
+	bsdf->sample_weight *= F;
+	/* Color saturation and the returned flags are shared with the non-Fresnel variants. */
+	return bsdf_microfacet_multi_ggx_common_setup(bsdf);
+}
+
+ccl_device int bsdf_microfacet_multi_ggx_refraction_setup(MicrofacetBsdf *bsdf)
+{
+	bsdf->alpha_y = bsdf->alpha_x;  /* isotropic: both axes share alpha_x */
+	/* NOTE(review): same type ID and body as bsdf_microfacet_multi_ggx_setup - confirm no refraction-specific ID is intended. */
+	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID;
+	/* Color saturation and the returned flags come from the shared setup. */
 	return bsdf_microfacet_multi_ggx_common_setup(bsdf);
 }
 
@@ -378,6 +417,8 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc
 		return make_float3(0.0f, 0.0f, 0.0f);
 	}
 
+	bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID);
+
 	bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y);
 	float3 X, Y, Z;
 	Z = bsdf->N;
@@ -393,7 +434,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc
 		*pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y));
 	else
 		*pdf = mf_ggx_pdf(localI, localO, bsdf->alpha_x);
-	return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL);
+	return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
 }
 
 ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state)
@@ -407,9 +448,15 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC
 		*omega_in = 2*dot(Z, I)*Z - I;
 		*pdf = 1e6f;
 		*eval = make_float3(1e6f, 1e6f, 1e6f);
+#ifdef __RAY_DIFFERENTIALS__
+		*domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx;
+		*domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy;
+#endif
 		return LABEL_REFLECT|LABEL_SINGULAR;
 	}
 
+	bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID);
+
 	bool is_aniso = (bsdf->alpha_x != bsdf->alpha_y);
 	if(is_aniso)
 		make_orthonormals_tangent(Z, bsdf->T, &X, &Y);
@@ -419,7 +466,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC
 	float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z));
 	float3 localO;
 
-	*eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL);
+	*eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
 	if(is_aniso)
 		*pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y));
 	else
@@ -427,6 +474,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC
 	*eval *= *pdf;
 
 	*omega_in = X*localO.x + Y*localO.y + Z*localO.z;
+
 #ifdef __RAY_DIFFERENTIALS__
 	*domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx;
 	*domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy;
@@ -450,6 +498,27 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_setup(MicrofacetBsdf *bsdf)
 	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG;
 }
 
+ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(MicrofacetBsdf *bsdf, const ShaderData *sd)
+{
+	/* Fresnel variant of the multiscatter glass closure: isotropic roughness clamped to [1e-4, 1], non-negative IOR. */
+	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID;
+	bsdf->alpha_y = bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f);
+	bsdf->ior = max(0.0f, bsdf->ior);
+
+	/* Saturate both `color` and `cspec0`, channel by channel, before the Fresnel interpolation reads cspec0. */
+	bsdf->extra->color.x = saturate(bsdf->extra->color.x);
+	bsdf->extra->color.y = saturate(bsdf->extra->color.y);
+	bsdf->extra->color.z = saturate(bsdf->extra->color.z);
+	bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
+	bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
+	bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
+
+	/* Weight the closure for sampling by the channel average of the interpolated Fresnel color. */
+	const float fresnel_base = fresnel_dielectric_cos(1.0f, bsdf->ior);
+	bsdf->sample_weight *= average(interpolate_fresnel_color(sd->I, bsdf->N, bsdf->ior, fresnel_base, bsdf->extra->cspec0));
+	return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSDF_NEEDS_LCG;
+}
+
 ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) {
 	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
 
@@ -465,7 +534,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClos
 	float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z));
 
 	*pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior);
-	return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior);
+	return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, false, bsdf->extra->color);
 }
 
 ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) {
@@ -475,6 +544,8 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu
 		return make_float3(0.0f, 0.0f, 0.0f);
 	}
 
+	bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID);
+
 	float3 X, Y, Z;
 	Z = bsdf->N;
 	make_orthonormals(Z, &X, &Y);
@@ -483,7 +554,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu
 	float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z));
 
 	*pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior);
-	return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior);
+	return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
 }
 
 ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state)
@@ -525,12 +596,14 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const S
 		}
 	}
 
+	bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID);
+
 	make_orthonormals(Z, &X, &Y);
 
 	float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z));
 	float3 localO;
 
-	*eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior);
+	*eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, use_fresnel, bsdf->extra->cspec0);
 	*pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior);
 	*eval *= *pdf;
 
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
index 8054fa8e849..e73915dbda7 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
@@ -26,19 +26,16 @@
  * the balance heuristic isn't necessarily optimal anymore.
  */
 ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
-        float3 wi,
-        float3 wo,
-        const bool wo_outside,
-        const float3 color,
-        const float alpha_x,
-        const float alpha_y,
-         ccl_addr_space uint *lcg_state
-#ifdef MF_MULTI_GLASS
-        , const float eta
-#elif defined(MF_MULTI_GLOSSY)
-        , float3 *n, float3 *k
-#endif
-)
+	float3 wi,
+	float3 wo,
+	const bool wo_outside,
+	const float3 color,
+	const float alpha_x,
+	const float alpha_y,
+	ccl_addr_space uint *lcg_state,
+	const float eta,
+	bool use_fresnel,
+	const float3 cspec0)
 {
 	/* Evaluating for a shallower incoming direction produces less noise, and the properties of the BSDF guarantee reciprocity. */
 	bool swapped = false;
@@ -71,50 +68,57 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 
 	/* Analytically compute single scattering for lower noise. */
 	float3 eval;
+	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+	const float3 wh = normalize(wi+wo);
 #ifdef MF_MULTI_GLASS
 	eval = mf_eval_phase_glass(-wi, lambda_r, wo, wo_outside, alpha, eta);
 	if(wo_outside)
 		eval *= -lambda_r / (shadowing_lambda - lambda_r);
 	else
 		eval *= -lambda_r * beta(-lambda_r, shadowing_lambda+1.0f);
-#elif defined(MF_MULTI_DIFFUSE)
-	/* Diffuse has no special closed form for the single scattering bounce */
-	eval = make_float3(0.0f, 0.0f, 0.0f);
 #else /* MF_MULTI_GLOSSY */
-	const float3 wh = normalize(wi+wo);
 	const float G2 = 1.0f / (1.0f - (lambda_r + 1.0f) + shadowing_lambda);
 	float val = G2 * 0.25f / wi.z;
 	if(alpha.x == alpha.y)
 		val *= D_ggx(wh, alpha.x);
 	else
 		val *= D_ggx_aniso(wh, alpha);
-	if(n && k) {
-		eval = fresnel_conductor(dot(wh, wi), *n, *k) * val;
-	}
-	else {
-		eval = make_float3(val, val, val);
-	}
+	eval = make_float3(val, val, val);
 #endif
 
+	float F0 = fresnel_dielectric_cos(1.0f, eta);
+	if(use_fresnel) {
+		throughput = interpolate_fresnel_color(wi, wh, eta, F0, cspec0);
+
+		eval *= throughput;
+	}
+
 	float3 wr = -wi;
 	float hr = 1.0f;
 	float C1_r = 1.0f;
 	float G1_r = 0.0f;
 	bool outside = true;
-	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
 
 	for(int order = 0; order < 10; order++) {
-		/* Sample microfacet height and normal */
-		if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state)))
+		/* Sample microfacet height. */
+		float height_rand = lcg_step_float_addrspace(lcg_state);
+		if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand))
 			break;
-		float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state),
-		                                                   lcg_step_float_addrspace(lcg_state)));
-
-#ifdef MF_MULTI_DIFFUSE
-		if(order == 0) {
-			/* Compute single-scattering for diffuse. */
-			const float G2_G1 = -lambda_r / (shadowing_lambda - lambda_r);
-			eval += throughput * G2_G1 * mf_eval_phase_diffuse(wo, wm);
+		/* Sample microfacet normal. */
+		float vndf_rand_y = lcg_step_float_addrspace(lcg_state);
+		float vndf_rand_x = lcg_step_float_addrspace(lcg_state);
+		float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y);
+
+#ifdef MF_MULTI_GLASS
+		if(order == 0 && use_fresnel) {
+			/* Evaluate amount of scattering towards wo on this microfacet. */
+			float3 phase;
+			if(outside)
+				phase = mf_eval_phase_glass(wr, lambda_r, wo, wo_outside, alpha, eta);
+			else
+				phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f / eta);
+
+			eval = throughput * phase * mf_G1(wo_outside ? wo : -wo, mf_C1((outside == wo_outside) ? hr : -hr), shadowing_lambda);
 		}
 #endif
 		if(order > 0) {
@@ -125,10 +129,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 				phase = mf_eval_phase_glass(wr, lambda_r,  wo,  wo_outside, alpha, eta);
 			else
 				phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f/eta);
-#elif defined(MF_MULTI_DIFFUSE)
-			phase = mf_eval_phase_diffuse(wo, wm);
 #else /* MF_MULTI_GLOSSY */
-			phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha, n, k) * throughput;
+			phase = mf_eval_phase_glossy(wr, lambda_r, wo, alpha) * throughput;
 #endif
 			eval += throughput * phase * mf_G1(wo_outside? wo: -wo, mf_C1((outside == wo_outside)? hr: -hr), shadowing_lambda);
 		}
@@ -136,23 +138,32 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 			/* Bounce from the microfacet. */
 #ifdef MF_MULTI_GLASS
 			bool next_outside;
-			wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside);
+			float3 wi_prev = -wr;
+			float phase_rand = lcg_step_float_addrspace(lcg_state);
+			wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside);
 			if(!next_outside) {
 				outside = !outside;
 				wr = -wr;
 				hr = -hr;
 			}
-#elif defined(MF_MULTI_DIFFUSE)
-			wr = mf_sample_phase_diffuse(wm,
-			                             lcg_step_float_addrspace(lcg_state),
-			                             lcg_step_float_addrspace(lcg_state));
+
+			if(use_fresnel && !next_outside) {
+				throughput *= color;
+			}
+			else if(use_fresnel && order > 0) {
+				throughput *= interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
+			}
 #else /* MF_MULTI_GLOSSY */
-			wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm);
+			if(use_fresnel && order > 0) {
+				throughput *= interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
+			}
+			wr = mf_sample_phase_glossy(-wr, &throughput, wm);
 #endif
 
 			lambda_r = mf_lambda(wr, alpha);
 
-			throughput *= color;
+			if(!use_fresnel)
+				throughput *= color;
 
 			C1_r = mf_C1(hr);
 			G1_r = mf_G1(wr, C1_r, lambda_r);
@@ -168,13 +179,16 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
  * escaped the surface in wo. The function returns the throughput between wi and wo.
  * Without reflection losses due to coloring or fresnel absorption in conductors, the sampling is optimal.
  */
-ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state
-#ifdef MF_MULTI_GLASS
-	, const float eta
-#elif defined(MF_MULTI_GLOSSY)
-	, float3 *n, float3 *k
-#endif
-)
+ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(
+	float3 wi,
+	float3 *wo,
+	const float3 color,
+	const float alpha_x,
+	const float alpha_y,
+	ccl_addr_space uint *lcg_state,
+	const float eta,
+	bool use_fresnel,
+	const float3 cspec0)
 {
 	const float2 alpha = make_float2(alpha_x, alpha_y);
 
@@ -186,37 +200,64 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3
 	float G1_r = 0.0f;
 	bool outside = true;
 
+	float F0 = fresnel_dielectric_cos(1.0f, eta);
+	if(use_fresnel) {
+		throughput = interpolate_fresnel_color(wi, normalize(wi + wr), eta, F0, cspec0);
+	}
+
 	int order;
 	for(order = 0; order < 10; order++) {
 		/* Sample microfacet height. */
-		if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) {
+		float height_rand = lcg_step_float_addrspace(lcg_state);
+		if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) {
 			/* The random walk has left the surface. */
 			*wo = outside? wr: -wr;
 			return throughput;
 		}
 		/* Sample microfacet normal. */
-		float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state),
-		                                                   lcg_step_float_addrspace(lcg_state)));
+		float vndf_rand_y = lcg_step_float_addrspace(lcg_state);
+		float vndf_rand_x = lcg_step_float_addrspace(lcg_state);
+		float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y);
 
 		/* First-bounce color is already accounted for in mix weight. */
-		if(order > 0)
+		if(!use_fresnel && order > 0)
 			throughput *= color;
 
 		/* Bounce from the microfacet. */
 #ifdef MF_MULTI_GLASS
 		bool next_outside;
-		wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside);
+		float3 wi_prev = -wr;
+		float phase_rand = lcg_step_float_addrspace(lcg_state);
+		wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside);
 		if(!next_outside) {
 			hr = -hr;
 			wr = -wr;
 			outside = !outside;
 		}
-#elif defined(MF_MULTI_DIFFUSE)
-		wr = mf_sample_phase_diffuse(wm,
-		                             lcg_step_float_addrspace(lcg_state),
-		                             lcg_step_float_addrspace(lcg_state));
+
+		if(use_fresnel) {
+			if(!next_outside) {
+				throughput *= color;
+			}
+			else {
+				float3 t_color = interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
+
+				if(order == 0)
+					throughput = t_color;
+				else
+					throughput *= t_color;
+			}
+		}
 #else /* MF_MULTI_GLOSSY */
-		wr = mf_sample_phase_glossy(-wr, n, k, &throughput, wm);
+		if(use_fresnel) {
+			float3 t_color = interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
+
+			if(order == 0)
+				throughput = t_color;
+			else
+				throughput *= t_color;
+		}
+		wr = mf_sample_phase_glossy(-wr, &throughput, wm);
 #endif
 
 		/* Update random walk parameters. */
@@ -228,6 +269,5 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3
 }
 
 #undef MF_MULTI_GLASS
-#undef MF_MULTI_DIFFUSE
 #undef MF_MULTI_GLOSSY
 #undef MF_PHASE_FUNCTION
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index cb342a026ef..6b770fc0c16 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -22,7 +22,6 @@ CCL_NAMESPACE_BEGIN
 typedef ccl_addr_space struct OrenNayarBsdf {
 	SHADER_CLOSURE_BASE;
 
-	float3 N;
 	float roughness;
 	float a;
 	float b;
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index e152a8780db..420f94755ee 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
 typedef ccl_addr_space struct PhongRampBsdf {
 	SHADER_CLOSURE_BASE;
 
-	float3 N;
 	float exponent;
 	float3 *colors;
 } PhongRampBsdf;
diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
new file mode 100644
index 00000000000..f8ca64293b0
--- /dev/null
+++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BSDF_PRINCIPLED_DIFFUSE_H__
+#define __BSDF_PRINCIPLED_DIFFUSE_H__
+
+/* DISNEY PRINCIPLED DIFFUSE BRDF
+ *
+ * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012)
+ */
+
+CCL_NAMESPACE_BEGIN
+
+typedef ccl_addr_space struct PrincipledDiffuseBsdf {
+	SHADER_CLOSURE_BASE;  /* NOTE(review): presumably supplies the N and type members that this file's functions access; confirm */
+
+	float roughness;  /* multiplies the LdotH^2 term of Fd90 in calculate_principled_diffuse_brdf() */
+} PrincipledDiffuseBsdf;
+
+ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf,
+	float3 N, float3 V, float3 L, float3 H, float *pdf)
+{
+	/* Burley diffuse term times the light cosine, returned as a grey float3; *pdf is only written on rejection. */
+	float NdotL = max(dot(N, L), 0.0f);
+	float NdotV = max(dot(N, V), 0.0f);
+
+	/* Both cosines are clamped above, so a `< 0` test could never fire: test NdotL for zero to make the rejection reachable. */
+	if(NdotL <= 0.0f) {
+		*pdf = 0.0f;
+		return make_float3(0.0f, 0.0f, 0.0f);
+	}
+
+	/* Fd90 grows with roughness and LdotH^2; each Schlick weight moves its factor from 1 towards Fd90 as its cosine drops to 0. */
+	float LdotH = dot(L, H);
+	float FL = schlick_fresnel(NdotL), FV = schlick_fresnel(NdotV);
+	const float Fd90 = 0.5f + 2.0f * LdotH*LdotH * bsdf->roughness;
+	float Fd = (1.0f * (1.0f - FL) + Fd90 * FL) * (1.0f * (1.0f - FV) + Fd90 * FV);
+
+	float value = M_1_PI_F * NdotL * Fd;
+	return make_float3(value, value, value);
+}
+
+ccl_device int bsdf_principled_diffuse_setup(PrincipledDiffuseBsdf *bsdf)
+{
+	bsdf->type = CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID;  /* tag the closure as principled diffuse */
+	return SD_BSDF|SD_BSDF_HAS_EVAL;  /* flag bits handed back to the caller */
+}
+
+ccl_device bool bsdf_principled_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b)
+{
+	const PrincipledDiffuseBsdf *bsdf_a = (const PrincipledDiffuseBsdf*)a;
+	const PrincipledDiffuseBsdf *bsdf_b = (const PrincipledDiffuseBsdf*)b;
+	/* True only when isequal_float3() accepts the two normals and the roughness values compare exactly equal. */
+	return (isequal_float3(bsdf_a->N, bsdf_b->N) && bsdf_a->roughness == bsdf_b->roughness);
+}
+
+ccl_device float3 bsdf_principled_diffuse_eval_reflect(const ShaderClosure *sc, const float3 I,
+	const float3 omega_in, float *pdf)
+{
+	/* Reflection lobe: cosine-weighted pdf plus the diffuse BRDF for light arriving from omega_in. */
+	const PrincipledDiffuseBsdf *bsdf = (const PrincipledDiffuseBsdf *)sc;
+	const float3 N = bsdf->N;
+	const float cos_in = dot(N, omega_in);
+
+	/* Light arriving at or below the shading horizon contributes nothing. */
+	if(!(cos_in > 0.0f)) {
+		*pdf = 0.0f;
+		return make_float3(0.0f, 0.0f, 0.0f);
+	}
+
+	/* I is the outgoing (view) direction, omega_in the incoming (light) direction. */
+	const float3 H = normalize(omega_in + I);
+	*pdf = cos_in * M_1_PI_F;
+	return calculate_principled_diffuse_brdf(bsdf, N, I, omega_in, H, pdf);
+}
+
+ccl_device float3 bsdf_principled_diffuse_eval_transmit(const ShaderClosure *sc, const float3 I,
+	const float3 omega_in, float *pdf)
+{
+	return make_float3(0.0f, 0.0f, 0.0f);  /* always black; note that *pdf is not written here */
+}
+
+ccl_device int bsdf_principled_diffuse_sample(const ShaderClosure *sc,
+	float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv,
+	float3 *eval, float3 *omega_in, float3 *domega_in_dx,
+	float3 *domega_in_dy, float *pdf)
+{
+	const PrincipledDiffuseBsdf *bsdf = (const PrincipledDiffuseBsdf *)sc;
+	/* Obtains *omega_in and *pdf from sample_cos_hemisphere() (presumably filled by it), then evaluates the BRDF for that direction. */
+	float3 N = bsdf->N;
+
+	sample_cos_hemisphere(N, randu, randv, omega_in, pdf);
+	/* Keep the direction only when it is on the positive side of the geometric normal Ng. */
+	if(dot(Ng, *omega_in) > 0) {
+		float3 H = normalize(I + *omega_in);
+
+		*eval = calculate_principled_diffuse_brdf(bsdf, N, I, *omega_in, H, pdf);
+
+#ifdef __RAY_DIFFERENTIALS__
+		// TODO: find a better approximation for the diffuse bounce
+		*domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx);
+		*domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy);
+#endif
+	}
+	else {
+		*pdf = 0.0f;  /* rejected: *eval and the differentials are left unwritten */
+	}
+	return LABEL_REFLECT|LABEL_DIFFUSE;  /* same label on both paths */
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __BSDF_PRINCIPLED_DIFFUSE_H__ */
+
+
diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
new file mode 100644
index 00000000000..f4476bfecd0
--- /dev/null
+++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BSDF_PRINCIPLED_SHEEN_H__
+#define __BSDF_PRINCIPLED_SHEEN_H__
+
+/* DISNEY PRINCIPLED SHEEN BRDF
+ *
+ * Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012)
+ */
+
+CCL_NAMESPACE_BEGIN
+
+typedef ccl_addr_space struct PrincipledSheenBsdf {
+	SHADER_CLOSURE_BASE;  /* no sheen-specific fields; NOTE(review): N and type presumably come from this macro; confirm */
+} PrincipledSheenBsdf;
+
+ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf,
+	float3 N, float3 V, float3 L, float3 H, float *pdf)
+{
+	/* Sheen lobe: Schlick weight of the light/half-vector cosine, scaled by the light cosine. */
+	const float cos_light = dot(N, L);
+	const float cos_view = dot(N, V);
+	const bool below_horizon = (cos_light < 0) || (cos_view < 0);
+
+	/* Either direction under the shading hemisphere: no contribution, and the pdf is cleared. */
+	if(below_horizon) {
+		*pdf = 0.0f;
+		return make_float3(0.0f, 0.0f, 0.0f);
+	}
+
+	const float sheen = schlick_fresnel(dot(L, H)) * cos_light;
+	return make_float3(sheen, sheen, sheen);
+}
+
+ccl_device int bsdf_principled_sheen_setup(PrincipledSheenBsdf *bsdf)
+{
+	bsdf->type = CLOSURE_BSDF_PRINCIPLED_SHEEN_ID;  /* tag the closure as principled sheen */
+	return SD_BSDF|SD_BSDF_HAS_EVAL;  /* flag bits handed back to the caller */
+}
+
+ccl_device float3 bsdf_principled_sheen_eval_reflect(const ShaderClosure *sc, const float3 I,
+	const float3 omega_in, float *pdf)
+{
+	/* Reflection lobe: cosine-weighted pdf plus the sheen BRDF for light arriving from omega_in. */
+	const PrincipledSheenBsdf *bsdf = (const PrincipledSheenBsdf *)sc;
+	const float3 N = bsdf->N;
+	const float cos_in = dot(N, omega_in);
+
+	/* Light arriving at or below the shading horizon contributes nothing. */
+	if(!(cos_in > 0.0f)) {
+		*pdf = 0.0f;
+		return make_float3(0.0f, 0.0f, 0.0f);
+	}
+
+	/* I is the outgoing (view) direction, omega_in the incoming (light) direction. */
+	const float3 H = normalize(omega_in + I);
+	*pdf = cos_in * M_1_PI_F;
+	return calculate_principled_sheen_brdf(bsdf, N, I, omega_in, H, pdf);
+}
+
+ccl_device float3 bsdf_principled_sheen_eval_transmit(const ShaderClosure *sc, const float3 I,
+	const float3 omega_in, float *pdf)
+{
+	return make_float3(0.0f, 0.0f, 0.0f);  /* always black; note that *pdf is not written here */
+}
+
+ccl_device int bsdf_principled_sheen_sample(const ShaderClosure *sc,
+	float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv,
+	float3 *eval, float3 *omega_in, float3 *domega_in_dx,
+	float3 *domega_in_dy, float *pdf)
+{
+	const PrincipledSheenBsdf *bsdf = (const PrincipledSheenBsdf *)sc;
+	/* Obtains *omega_in and *pdf from sample_cos_hemisphere() (presumably filled by it), then evaluates the sheen BRDF for that direction. */
+	float3 N = bsdf->N;
+
+	sample_cos_hemisphere(N, randu, randv, omega_in, pdf);
+	/* Keep the direction only when it is on the positive side of the geometric normal Ng. */
+	if(dot(Ng, *omega_in) > 0) {
+		float3 H = normalize(I + *omega_in);
+
+		*eval = calculate_principled_sheen_brdf(bsdf, N, I, *omega_in, H, pdf);
+
+#ifdef __RAY_DIFFERENTIALS__
+		// TODO: find a better approximation for the diffuse bounce
+		*domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx);
+		*domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy);
+#endif
+	}
+	else {
+		*pdf = 0.0f;  /* rejected: *eval and the differentials are left unwritten */
+	}
+	return LABEL_REFLECT|LABEL_DIFFUSE;  /* same label on both paths */
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __BSDF_PRINCIPLED_SHEEN_H__ */
+
+
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index 28e775bcbc8..d8b6d8ddead 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN
 typedef ccl_addr_space struct ToonBsdf {
 	SHADER_CLOSURE_BASE;
 
-	float3 N;
 	float size;
 	float smooth;
 } ToonBsdf;
diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h
index b0c5280b6cb..3dc15d5791c 100644
--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -124,6 +124,13 @@ ccl_device float3 fresnel_conductor(float cosi, const float3 eta, const float3 k
 	return(Rparl2 + Rperp2) * 0.5f;
 }
 
+ccl_device float schlick_fresnel(float u)
+{
+	const float base = clamp(1.0f - u, 0.0f, 1.0f); /* (1 - u), kept inside [0, 1] */
+	const float base_sq = base * base;
+	return (base_sq * base_sq) * base; /* fifth power via two squarings */
+}
+
 ccl_device float smooth_step(float edge0, float edge1, float x)
 {
 	float result;
@@ -136,6 +143,19 @@ ccl_device float smooth_step(float edge0, float edge1, float x)
 	return result;
 }
 
+/* Calculate the fresnel color, which is a blend between the F0 color (cspec0) and white. */
+ccl_device_forceinline float3 interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, float3 cspec0) {
+	/* Calculate the fresnel interpolation factor.
+	 * The value from fresnel_dielectric_cos(...) is shifted by F0 and rescaled by 1 / (1 - F0),
+	 * because cspec0 already carries the F0 color. NOTE(review): F0 == 1 divides by zero here.
+	 */
+	float F0_norm = 1.0f / (1.0f - F0);
+	float FH = (fresnel_dielectric_cos(dot(L, H), ior) - F0) * F0_norm;
+
+	/* FH == 0 returns cspec0 unchanged, FH == 1 returns white. */
+	return cspec0 * (1.0f - FH) + make_float3(1.0f, 1.0f, 1.0f) * FH;
+}
+
 CCL_NAMESPACE_END
 
 #endif /* __BSDF_UTIL_H__ */
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index af0bbd861a9..267aeea6e86 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -27,7 +27,7 @@ typedef ccl_addr_space struct Bssrdf {
 	float d;
 	float texture_blur;
 	float albedo;
-	float3 N;
+	float roughness;
 } Bssrdf;
 
 /* Planar Truncated Gaussian
@@ -348,8 +348,9 @@ ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight)
 {
 	Bssrdf *bssrdf = (Bssrdf*)closure_alloc(sd, sizeof(Bssrdf), CLOSURE_NONE_ID, weight);
 
-	if(!bssrdf)
+	if(bssrdf == NULL) {
 		return NULL;
+	}
 
 	float sample_weight = fabsf(average(weight));
 	bssrdf->sample_weight = sample_weight;
@@ -360,10 +361,32 @@ ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type)
 {
 	if(bssrdf->radius < BSSRDF_MIN_RADIUS) {
 		/* revert to diffuse BSDF if radius too small */
-		DiffuseBsdf *bsdf = (DiffuseBsdf*)bssrdf;
-		bsdf->N = bssrdf->N;
-		int flag = bsdf_diffuse_setup(bsdf);
-		bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
+		int flag;
+#ifdef __PRINCIPLED__
+		if(type == CLOSURE_BSSRDF_PRINCIPLED_ID) {
+			float roughness = bssrdf->roughness;
+			float3 N = bssrdf->N;
+			float3 weight = bssrdf->weight;
+			float sample_weight = bssrdf->sample_weight;
+
+			PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bssrdf;
+
+			bsdf->N = N;
+			bsdf->roughness = roughness;
+			bsdf->weight = weight;
+			bsdf->sample_weight = sample_weight;
+			flag = bsdf_principled_diffuse_setup(bsdf);
+			bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID;
+		}
+		else
+#endif  /* __PRINCIPLED__ */
+		{
+			DiffuseBsdf *bsdf = (DiffuseBsdf*)bssrdf;
+			bsdf->N = bssrdf->N;
+			flag = bsdf_diffuse_setup(bsdf);
+			bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
+		}
+		
 		return flag;
 	}
 	else {
@@ -371,11 +394,13 @@ ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type)
 		bssrdf->sharpness = saturate(bssrdf->sharpness);
 		bssrdf->type = type;
 
-		if(type == CLOSURE_BSSRDF_BURLEY_ID) {
+		if(type == CLOSURE_BSSRDF_BURLEY_ID ||
+		   type == CLOSURE_BSSRDF_PRINCIPLED_ID)
+		{
 			bssrdf_burley_setup(bssrdf);
 		}
 
-		return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF;
+		return SD_BSSRDF;
 	}
 }
 
@@ -385,7 +410,7 @@ ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float
 		bssrdf_cubic_sample(sc, xi, r, h);
 	else if(sc->type == CLOSURE_BSSRDF_GAUSSIAN_ID)
 		bssrdf_gaussian_sample(sc, xi, r, h);
-	else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID)*/
+	else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID || sc->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/
 		bssrdf_burley_sample(sc, xi, r, h);
 }
 
@@ -395,7 +420,7 @@ ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r)
 		return bssrdf_cubic_pdf(sc, r);
 	else if(sc->type == CLOSURE_BSSRDF_GAUSSIAN_ID)
 		return bssrdf_gaussian_pdf(sc, r);
-	else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID)*/
+	else /*if(sc->type == CLOSURE_BSSRDF_BURLEY_ID || sc->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/
 		return bssrdf_burley_pdf(sc, r);
 }
 
diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h
new file mode 100644
index 00000000000..f6e474d6702
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FILTER_H__
+#define __FILTER_H__
+
+/* CPU Filter Kernel Interface */
+
+#include "util/util_types.h"
+
+#include "kernel/filter/filter_defines.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z
+#define KERNEL_NAME_EVAL(arch, name)  KERNEL_NAME_JOIN(kernel, arch, name)  /* extra level so KERNEL_ARCH is expanded before pasting */
+#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)  /* yields kernel_<KERNEL_ARCH>_<name> */
+/* filter_cpu.h is included once per CPU variant. NOTE(review): KERNEL_ARCH is never #undef'd here, so presumably filter_cpu.h does it; confirm. */
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/filter_cpu.h"
+
+CCL_NAMESPACE_END
+
+#endif /* __FILTER_H__ */
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
new file mode 100644
index 00000000000..ce96f733aff
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_defines.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __FILTER_DEFINES_H__
+#define __FILTER_DEFINES_H__
+
+#define DENOISE_FEATURES 10  /* length of the per-pixel feature arrays */
+#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)  /* 100 */
+#define XTWX_SIZE      (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)  /* 66; NOTE(review): presumably a packed triangular (F+1)x(F+1) matrix; confirm */
+#define XTWY_SIZE      (DENOISE_FEATURES+1)  /* 11 */
+
+typedef struct TilesInfo {
+	int offsets[9];  /* NOTE(review): nine entries each here, presumably one per tile of a 3x3 neighbourhood; confirm */
+	int strides[9];
+	int x[4];
+	int y[4];
+	/* TODO(lukas): CUDA doesn't have uint64_t... */
+#ifdef __KERNEL_OPENCL__
+	ccl_global float *buffers[9];  /* OpenCL build: typed ccl_global float pointers */
+#else
+	long long int buffers[9];  /* other builds: NOTE(review): presumably the buffer addresses stored as integers; confirm */
+#endif
+} TilesInfo;
+
+#endif /* __FILTER_DEFINES_H__*/
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
new file mode 100644
index 00000000000..6226ed2c2ef
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_features.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride]  /* element `pass` strides into buffer; reads pass_stride from the caller's scope */
+
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
+ * pixel_buffer always points to the current pixel in the first pass; the expansion uses pixel, pixel_buffer, buffer, buffer_w, rect, low and high from the enclosing scope. */
+#define FOR_PIXEL_WINDOW     pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
+                             for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+                                 for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
+/* Closes both loops opened by FOR_PIXEL_WINDOW and moves pixel_buffer on to the start of the window's next row. */
+#define END_FOR_PIXEL_WINDOW     } \
+                                 pixel_buffer += buffer_w - (high.x - low.x); \
+                             }
+
+ccl_device_inline void filter_get_features(int2 pixel,
+                                           const ccl_global float *ccl_restrict buffer,
+                                           float *features,
+                                           const float *ccl_restrict mean,
+                                           int pass_stride)
+{
+	/* Features 0-1 are the pixel coordinates, feature 2 the magnitude of pass 0. */
+	features[0] = pixel.x;
+	features[1] = pixel.y;
+	features[2] = fabsf(ccl_get_feature(buffer, 0));
+	/* Features 3-9 are passes 1-7, copied as stored. */
+	for(int pass = 1; pass <= 7; pass++) {
+		features[pass + 2] = ccl_get_feature(buffer, pass);
+	}
+	/* Centre on the mean when one is supplied; a null mean leaves the raw values. */
+	if(mean) {
+		for(int i = 0; i < DENOISE_FEATURES; i++) {
+			features[i] -= mean[i];
+		}
+	}
+}
+
+ccl_device_inline void filter_get_feature_scales(int2 pixel,
+                                                 const ccl_global float *ccl_restrict buffer,
+                                                 float *scales,
+                                                 const float *ccl_restrict mean,
+                                                 int pass_stride)
+{
+	scales[0] = fabsf(pixel.x - mean[0]);  /* mean is dereferenced unconditionally here, so it must not be null */
+	scales[1] = fabsf(pixel.y - mean[1]);
+	scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]);
+	scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],  /* passes 1-3 treated as one vector; no square root is taken here */
+	                                    ccl_get_feature(buffer, 2) - mean[4],
+	                                    ccl_get_feature(buffer, 3) - mean[5]));
+	scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
+	scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],  /* passes 5-7 treated as one vector; only scales[0..5] are written */
+	                                    ccl_get_feature(buffer, 6) - mean[8],
+	                                    ccl_get_feature(buffer, 7) - mean[9]));
+}
+
+ccl_device_inline void filter_calculate_scale(float *scale)
+{
+	scale[0] = 1.0f/max(scale[0], 0.01f);  /* in place: six input slots become ten reciprocals, each divisor floored at 0.01 */
+	scale[1] = 1.0f/max(scale[1], 0.01f);
+	scale[2] = 1.0f/max(scale[2], 0.01f);
+	scale[6] = 1.0f/max(scale[4], 0.01f);  /* reads input slot 4 before the last line overwrites it */
+	scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f);  /* reads input slot 5 before the last line overwrites it */
+	scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);  /* must stay last: it clobbers slots 4 and 5 */
+}
+
+ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer,
+                                          int pass_stride)
+{
+	return make_float3(ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10));  /* passes 8, 9 and 10 become x, y and z */
+}
+
+ccl_device_inline void design_row_add(float *design_row,
+                                      int rank,
+                                      const ccl_global float *ccl_restrict transform,
+                                      int stride,
+                                      int row,
+                                      float feature)
+{
+	for(int i = 0; i < rank; i++) {  /* accumulates into design_row[1..rank]; slot 0 is never touched */
+		design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature;  /* entry (row, i) of transform, elements `stride` apart */
+	}
+}
+
+/* Fill the design row: slot 0 is the constant 1, slots 1..rank accumulate the ten q - p feature differences through `transform`. */
+ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
+                                                       const ccl_global float *ccl_restrict p_buffer,
+                                                       int2 q_pixel,
+                                                       const ccl_global float *ccl_restrict q_buffer,
+                                                       int pass_stride,
+                                                       int rank,
+                                                       float *design_row,
+                                                       const ccl_global float *ccl_restrict transform,
+                                                       int stride)
+{
+	design_row[0] = 1.0f;
+	math_vector_zero(design_row+1, rank);  /* NOTE(review): presumably clears the `rank` slots that design_row_add() accumulates into */
+	design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);  /* rows 0-1: pixel coordinate differences */
+	design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
+	design_row_add(design_row, rank, transform, stride, 2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));  /* row 2: difference of the magnitudes of pass 0 */
+	design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));  /* rows 3-9: differences of passes 1-7 */
+	design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
+	design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
+	design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
+	design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
+	design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
+	design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
new file mode 100644
index 00000000000..3ddd8712266
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride)  /* uses buffer and pass_stride from the caller's scope */
+
+/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
+ * pixel_buffer always points to the first of the 4 current pixels in the first pass.
+ * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */
+
+#define FOR_PIXEL_WINDOW_SSE     pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
+                                 for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
+                                     float4 y4 = make_float4(pixel.y); \
+                                     for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
+                                         float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
+                                         int4 active_pixels = x4 < make_float4(high.x);
+/* Closes both loops. The row step subtracts (pixel.x - low.x), the distance actually advanced, which can exceed the window width because x moves in steps of 4. */
+#define END_FOR_PIXEL_WINDOW_SSE     } \
+                                     pixel_buffer += buffer_w - (pixel.x - low.x); \
+                                 }
+
+ccl_device_inline void filter_get_features_sse(float4 x, float4 y,
+                                               int4 active_pixels,
+                                               const float *ccl_restrict buffer,
+                                               float4 *features,
+                                               const float4 *ccl_restrict mean,
+                                               int pass_stride)
+{
+	features[0] = x;  /* features 0-1: the x and y values passed in by the caller */
+	features[1] = y;
+	features[2] = fabs(ccl_get_feature_sse(0));  /* feature 2: magnitude of pass 0 */
+	features[3] = ccl_get_feature_sse(1);  /* features 3-9: passes 1-7 as loaded */
+	features[4] = ccl_get_feature_sse(2);
+	features[5] = ccl_get_feature_sse(3);
+	features[6] = ccl_get_feature_sse(4);
+	features[7] = ccl_get_feature_sse(5);
+	features[8] = ccl_get_feature_sse(6);
+	features[9] = ccl_get_feature_sse(7);
+	if(mean) {  /* the mean is optional: a null pointer skips the subtraction */
+		for(int i = 0; i < DENOISE_FEATURES; i++)
+			features[i] = features[i] - mean[i];
+	}
+	for(int i = 0; i < DENOISE_FEATURES; i++)
+		features[i] = mask(active_pixels, features[i]);  /* NOTE(review): presumably clears the lanes outside the window; confirm mask() semantics */
+}
+
+ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y,
+                                                     int4 active_pixels,
+                                                     const float *ccl_restrict buffer,
+                                                     float4 *scales,
+                                                     const float4 *ccl_restrict mean,
+                                                     int pass_stride)
+{
+	scales[0] = fabs(x - mean[0]);  /* mean is dereferenced unconditionally here, so it must not be null */
+	scales[1] = fabs(y - mean[1]);
+	scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]);
+	scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) +  /* sum of squares over passes 1-3; no square root is taken here */
+	            sqr(ccl_get_feature_sse(2) - mean[4]) +
+	            sqr(ccl_get_feature_sse(3) - mean[5]);
+	scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]);
+	scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) +  /* sum of squares over passes 5-7 */
+	            sqr(ccl_get_feature_sse(6) - mean[8]) +
+	            sqr(ccl_get_feature_sse(7) - mean[9]);
+	for(int i = 0; i < 6; i++)  /* only the six slots written above are masked */
+		scales[i] = mask(active_pixels, scales[i]);
+}
+
+ccl_device_inline void filter_calculate_scale_sse(float4 *scale)
+{
+	scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f)));  /* in place: six input slots become ten; NOTE(review): reduce_max() presumably takes the maximum across the four lanes */
+	scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
+	scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
+	scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f)));  /* reads input slot 4 before the last line overwrites it */
+	scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f)));  /* reads input slot 5 before the last line overwrites it */
+	scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f)));  /* must stay last: it clobbers slots 4 and 5 */
+}
+
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h
new file mode 100644
index 00000000000..2ef03dc0a02
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_kernel.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/util_color.h"
+#include "util/util_math.h"
+#include "util/util_math_fast.h"
+#include "util/util_texture.h"
+
+#include "util/util_atomic.h"
+#include "util/util_math_matrix.h"
+
+#include "kernel/filter/filter_defines.h"
+
+#include "kernel/filter/filter_features.h"
+#ifdef __KERNEL_SSE3__
+#  include "kernel/filter/filter_features_sse.h"
+#endif
+
+#include "kernel/filter/filter_prefilter.h"
+
+#ifdef __KERNEL_GPU__
+#  include "kernel/filter/filter_transform_gpu.h"
+#else
+#  ifdef __KERNEL_SSE3__
+#    include "kernel/filter/filter_transform_sse.h"
+#  else
+#    include "kernel/filter/filter_transform.h"
+#  endif
+#endif
+
+#include "kernel/filter/filter_reconstruction.h"
+
+#ifdef __KERNEL_CPU__
+#  include "kernel/filter/filter_nlm_cpu.h"
+#else
+#  include "kernel/filter/filter_nlm_gpu.h"
+#endif
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
new file mode 100644
index 00000000000..5e989331bc2
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Per-pixel NLM difference between the image and itself shifted by (dx, dy), for all pixels of rect.
+ * Each channel contributes (squared difference - a*(pvar + min(pvar, qvar))) / (1e-8 + k_2*(pvar + qvar));
+ * the result is averaged over 3 channels if channel_offset is non-zero, otherwise a single channel is used. */
+ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
+                                                         const float *ccl_restrict weight_image,
+                                                         const float *ccl_restrict variance_image,
+                                                         float *difference_image, int4 rect, int w,
+                                                         int channel_offset, float a, float k_2)
+{
+	const int num_channels = channel_offset? 3 : 1;
+	/* Offset from a pixel p to its shifted counterpart q within one channel plane. */
+	const int q_shift = dy*w + dx;
+	for(int y = rect.y; y < rect.w; y++) {
+		for(int x = rect.x; x < rect.z; x++) {
+			const int p = y*w + x;
+			float diff = 0.0f;
+			for(int c = 0; c < num_channels; c++) {
+				const int pc = c*channel_offset + p;
+				const float pvar = variance_image[pc];
+				const float qvar = variance_image[pc + q_shift];
+				const float cdiff = weight_image[pc] - weight_image[pc + q_shift];
+				diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
+			}
+			difference_image[p] = (num_channels > 1)? diff * (1.0f/num_channels) : diff;
+		}
+	}
+}
+
+/* Vertical box blur: out_image(x, y) becomes the mean of difference_image over the rows y-f .. y+f, clamped to rect.
+ * Rows are summed four floats at a time over the columns 4*(rect.x/4) .. 4*((rect.z+3)/4), which may extend past
+ * rect; only columns inside rect are zeroed and normalized. NOTE(review): presumably needs 16-byte aligned rows - confirm. */
+ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image,
+                                              float *out_image, int4 rect, int w, int f)
+{
+	const int aligned_lowx = rect.x / 4;
+	const int aligned_highx = (rect.z + 3) / 4;
+	for(int y = rect.y; y < rect.w; y++) {
+		const int low = max(rect.y, y-f);
+		const int high = min(rect.w, y+f+1);
+		float4 *out_row4 = (float4*)(out_image + y*w);
+		for(int x = rect.x; x < rect.z; x++) {
+			out_image[y*w+x] = 0.0f;
+		}
+		for(int y1 = low; y1 < high; y1++) {
+			const float4 *diff_row4 = (const float4*)(difference_image + y1*w); /* Read-only view: keeps the const of the input. */
+			for(int x = aligned_lowx; x < aligned_highx; x++) {
+				out_row4[x] += diff_row4[x];
+			}
+		}
+		for(int x = rect.x; x < rect.z; x++) {
+			out_image[y*w+x] *= 1.0f/(high - low);
+		}
+	}
+}
+
+/* Horizontal box blur (radius f, clamped to rect) of difference_image, turned into a weight:
+ * out_image(x, y) = fast_expf(-max(blurred, 0)) for every pixel of rect.
+ * The sum is built one horizontal shift dx at a time, so that the innermost loop runs over contiguous pixels. */
+ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
+                                                     float *out_image, int4 rect, int w, int f)
+{
+	for(int y = rect.y; y < rect.w; y++) {
+		float *out_row = out_image + y*w;
+		const float *diff_row = difference_image + y*w;
+		for(int x = rect.x; x < rect.z; x++) {
+			out_row[x] = 0.0f;
+		}
+		/* For each shift, only visit the pixels whose shifted source column x+dx lies inside rect. */
+		for(int dx = -f; dx <= f; dx++) {
+			const int x_begin = rect.x - min(0, dx);
+			const int x_end = rect.z - max(0, dx);
+			for(int x = x_begin; x < x_end; x++) {
+				out_row[x] += diff_row[x + dx];
+			}
+		}
+		/* Divide by the number of columns that contributed, then map the result to a weight. */
+		for(int x = rect.x; x < rect.z; x++) {
+			const int low = max(rect.x, x-f);
+			const int high = min(rect.z, x+f+1);
+			out_row[x] = fast_expf(-max(out_row[x] * (1.0f/(high - low)), 0.0f));
+		}
+	}
+}
+
+/* For every pixel of rect: box-filter the weights in difference_image horizontally (radius f, clamped to rect),
+ * then add the weight to accum_image and the weighted pixel of image at offset (dx, dy) to out_image. */
+ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
+                                                       const float *ccl_restrict difference_image,
+                                                       const float *ccl_restrict image,
+                                                       float *out_image, float *accum_image, int4 rect, int w, int f)
+{
+	for(int y = rect.y; y < rect.w; y++) {
+		const float *weight_row = difference_image + y*w;
+		for(int x = rect.x; x < rect.z; x++) {
+			const int low = max(rect.x, x-f);
+			const int high = min(rect.z, x+f+1);
+			float sum = 0.0f;
+			for(int x1 = low; x1 < high; x1++) {
+				sum += weight_row[x1];
+			}
+			const float weight = sum * (1.0f/(high - low));
+			const int p = y*w + x;
+			accum_image[p] += weight;
+			out_image[p] += weight*image[(y+dy)*w+(x+dx)];
+		}
+	}
+}
+
+/* For every pixel shared by rect and the filter window: box-filter the NLM weights horizontally (radius f,
+ * clamped to rect) and pass the resulting weight, together with that pixel's slice of transform, rank,
+ * XtWX and XtWY, to kernel_filter_construct_gramian() for the neighbor at offset (dx, dy). */
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
+                                                           const float *ccl_restrict difference_image,
+                                                           const float *ccl_restrict buffer,
+                                                           float *transform, int *rank,
+                                                           float *XtWX, float3 *XtWY,
+                                                           int4 rect, int4 filter_rect,
+                                                           int w, int h, int f, int pass_stride)
+{
+	/* fx and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
+	const int fy_begin = max(0, rect.y-filter_rect.y);
+	const int fy_end = min(filter_rect.w, rect.w-filter_rect.y);
+	const int fx_begin = max(0, rect.x-filter_rect.x);
+	const int fx_end = min(filter_rect.z, rect.z-filter_rect.x);
+	for(int fy = fy_begin; fy < fy_end; fy++) {
+		const int y = fy + filter_rect.y;
+		const float *weight_row = difference_image + y*w;
+		for(int fx = fx_begin; fx < fx_end; fx++) {
+			const int x = fx + filter_rect.x;
+			const int low = max(rect.x, x-f);
+			const int high = min(rect.z, x+f+1);
+			float sum = 0.0f;
+			for(int x1 = low; x1 < high; x1++) {
+				sum += weight_row[x1];
+			}
+			const float weight = sum * (1.0f/(high - low));
+			/* Each pixel owns a contiguous slice of the storage arrays, indexed row by row (filter_rect.z per row). */
+			const int storage_ofs = fy*filter_rect.z + fx;
+			kernel_filter_construct_gramian(x, y, 1,
+			                                dx, dy, w, h,
+			                                pass_stride, buffer,
+			                                transform + storage_ofs*TRANSFORM_SIZE,
+			                                rank + storage_ofs, weight,
+			                                XtWX + storage_ofs*XTWX_SIZE,
+			                                XtWY + storage_ofs*XTWY_SIZE, 0);
+		}
+	}
+}
+
+/* Divide every pixel of out_image inside rect by the matching value of accum_image. */
+ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
+                                                   const float *ccl_restrict accum_image, int4 rect, int w)
+{
+	for(int y = rect.y; y < rect.w; y++) {
+		const int row_ofs = y*w;
+		for(int x = rect.x; x < rect.z; x++) {
+			out_image[row_ofs + x] /= accum_image[row_ofs + x];
+		}
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
new file mode 100644
index 00000000000..2c5ac807051
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* GPU variant: NLM difference between the pixel (x, y) and the pixel at offset (dx, dy), stored in difference_image.
+ * Averaged over 3 channels (channel_offset floats apart) if channel_offset is non-zero, else a single channel is used. */
+ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
+                                                         int dx, int dy,
+                                                         const ccl_global float *ccl_restrict weight_image,
+                                                         const ccl_global float *ccl_restrict variance_image,
+                                                         ccl_global float *difference_image,
+                                                         int4 rect, int w, int channel_offset, float a, float k_2)
+{
+	const int num_channels = channel_offset? 3 : 1;
+	const int p_ofs = y*w + x;
+	const int q_ofs = (y+dy)*w + (x+dx);
+	float diff = 0.0f;
+	for(int c = 0; c < num_channels; c++) {
+		const int c_ofs = c*channel_offset;
+		const float pvar = variance_image[c_ofs + p_ofs];
+		const float qvar = variance_image[c_ofs + q_ofs];
+		const float cdiff = weight_image[c_ofs + p_ofs] - weight_image[c_ofs + q_ofs];
+		diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
+	}
+	difference_image[p_ofs] = (num_channels > 1)? diff * (1.0f/num_channels) : diff;
+}
+
+/* GPU variant: vertical box blur of difference_image at the pixel (x, y), i.e. the mean over the rows
+ * y-f .. y+f (clamped to rect), stored in out_image. */
+ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
+                                              const ccl_global float *ccl_restrict difference_image,
+                                              ccl_global float *out_image, int4 rect, int w, int f)
+{
+	const int y_begin = max(rect.y, y-f);
+	const int y_end = min(rect.w, y+f+1);
+	float total = 0.0f;
+	for(int row = y_begin; row < y_end; row++) {
+		total += difference_image[row*w+x];
+	}
+	out_image[y*w+x] = total * (1.0f/(y_end-y_begin));
+}
+
+/* GPU variant: horizontal box blur (radius f, clamped to rect) of difference_image at the pixel (x, y),
+ * stored in out_image as fast_expf(-max(blurred, 0)). */
+ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
+                                                     const ccl_global float *ccl_restrict difference_image,
+                                                     ccl_global float *out_image, int4 rect, int w, int f)
+{
+	const int x_begin = max(rect.x, x-f);
+	const int x_end = min(rect.z, x+f+1);
+	float total = 0.0f;
+	for(int col = x_begin; col < x_end; col++) {
+		total += difference_image[y*w+col];
+	}
+	out_image[y*w+x] = fast_expf(-max(total * (1.0f/(x_end-x_begin)), 0.0f));
+}
+
+/* GPU variant: box-filter the NLM weights horizontally around (x, y) (radius f, clamped to rect). If out_image is
+ * set, add the weight to accum_image and the weighted pixel of image at offset (dx, dy) to out_image; else store the weight. */
+ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, int dx, int dy,
+                                                       const ccl_global float *ccl_restrict difference_image,
+                                                       const ccl_global float *ccl_restrict image,
+                                                       ccl_global float *out_image, ccl_global float *accum_image,
+                                                       int4 rect, int w, int f)
+{
+	const int low = max(rect.x, x-f);
+	const int high = min(rect.z, x+f+1);
+	float sum = 0.0f;
+	for(int x1 = low; x1 < high; x1++) {
+		sum += difference_image[y*w+x1];
+	}
+	const float weight = sum * (1.0f/(high-low));
+	const int p = y*w + x;
+	if(!out_image) {
+		accum_image[p] = weight;
+		return;
+	}
+	accum_image[p] += weight;
+	out_image[p] += weight*image[(y+dy)*w+(x+dx)];
+}
+
+/* GPU variant: box-filter the NLM weights horizontally around the pixel (radius f, clamped to rect) and pass
+ * the resulting weight, together with that pixel's entries of transform, rank, XtWX and XtWY, to
+ * kernel_filter_construct_gramian() for the neighbor at offset (dx, dy). */
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, int dx, int dy,
+                                                           const ccl_global float *ccl_restrict difference_image,
+                                                           const ccl_global float *ccl_restrict buffer,
+                                                           const ccl_global float *ccl_restrict transform,
+                                                           ccl_global int *rank,
+                                                           ccl_global float *XtWX,
+                                                           ccl_global float3 *XtWY,
+                                                           int4 rect, int4 filter_rect,
+                                                           int w, int h, int f, int pass_stride, int localIdx)
+{
+	/* (fx, fy) is relative to the filter window; (x, y) is in the coordinates used by rect and the images. */
+	const int y = fy + filter_rect.y;
+	const int x = fx + filter_rect.x;
+
+	const int low = max(rect.x, x-f);
+	const int high = min(rect.z, x+f+1);
+	float sum = 0.0f;
+	for(int x1 = low; x1 < high; x1++) {
+		sum += difference_image[y*w+x1];
+	}
+	const float weight = sum * (1.0f/(high - low));
+
+	/* All storage arrays start at the pixel's own index; filter_rect.z*filter_rect.w is handed on
+	 * as the storage stride. */
+	const int storage_ofs = fy*filter_rect.z + fx;
+	kernel_filter_construct_gramian(x, y,
+	                                filter_rect.z*filter_rect.w,
+	                                dx, dy, w, h,
+	                                pass_stride, buffer,
+	                                transform + storage_ofs,
+	                                rank + storage_ofs,
+	                                weight,
+	                                XtWX + storage_ofs,
+	                                XtWY + storage_ofs,
+	                                localIdx);
+}
+
+/* GPU variant: divide the pixel (x, y) of out_image by the matching value of accum_image. */
+ccl_device_inline void kernel_filter_nlm_normalize(int x, int y, ccl_global float *out_image,
+                                                   const ccl_global float *ccl_restrict accum_image, int4 rect, int w)
+{
+	const int idx = y*w + x;
+	out_image[idx] /= accum_image[idx];
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
new file mode 100644
index 00000000000..eefcbfea230
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* First step of the shadow prefiltering: performs the shadow division and stores the results in plain
+ * rectangular arrays (row width: width of rect rounded up to a multiple of 4) that can be passed to the NLM filter.
+ *
+ * Outputs, all written at the pixel (x, y):
+ * unfilteredA, unfilteredB: The two half images of the shadow feature pass.
+ * sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general,
+ *                 and especially here since the variance of the ratio can only be approximated.
+ * sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer
+ *                  variance of the two variance halves).
+ * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy.
+ */
+ccl_device void kernel_filter_divide_shadow(int sample,
+                                            ccl_global TilesInfo *tiles, int x, int y,
+                                            ccl_global float *unfilteredA,
+                                            ccl_global float *unfilteredB,
+                                            ccl_global float *sampleVariance,
+                                            ccl_global float *sampleVarianceV,
+                                            ccl_global float *bufferVariance,
+                                            int4 rect, int buffer_pass_stride, int buffer_denoising_offset)
+{
+	/* Select which of the 3x3 tiles contains the pixel. */
+	const int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
+	const int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
+	const int tile = ytile*3+xtile;
+
+	/* The six floats read below start 14 floats into the denoising data of the pixel. */
+	const int pixel_ofs = (y*tiles->strides[tile] + x + tiles->offsets[tile])*buffer_pass_stride;
+	const ccl_global float *ccl_restrict shadow = (ccl_global float*) tiles->buffers[tile];
+	shadow += pixel_ofs + buffer_denoising_offset + 14;
+
+	const int buffer_w = align_up(rect.z - rect.x, 4);
+	const int idx = (y-rect.y)*buffer_w + (x - rect.x);
+	const float ratioA = shadow[1] / max(shadow[0], 1e-7f);
+	const float ratioB = shadow[4] / max(shadow[3], 1e-7f);
+	unfilteredA[idx] = ratioA;
+	unfilteredB[idx] = ratioB;
+
+	const int odd_sample = (sample+1)/2;
+	const int even_sample = sample/2;
+
+	/* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
+	 * update does not work efficiently with atomics in the kernel. */
+	float varA = max(0.0f, shadow[2] - ratioA*ratioA*odd_sample);
+	float varB = max(0.0f, shadow[5] - ratioB*ratioB*even_sample);
+	varA /= max(odd_sample - 1, 1);
+	varB /= max(even_sample - 1, 1);
+
+	const float var_diff = varA - varB;
+	const float ratio_diff = ratioA - ratioB;
+	sampleVariance[idx]  = 0.5f*(varA + varB) / sample;
+	sampleVarianceV[idx] = 0.5f * var_diff * var_diff / (sample*sample);
+	bufferVariance[idx]  = 0.5f * ratio_diff * ratio_diff;
+}
+
+/* Copy one regular feature from the render buffers into the denoise buffers.
+ * Parameters:
+ * - sample: The sample amount in the buffer, used to normalize the buffer.
+ * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature.
+ * - x, y: Current pixel.
+ * - mean, variance: Target denoise buffers (row width: width of rect rounded up to a multiple of 4).
+ * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
+ */
+ccl_device void kernel_filter_get_feature(int sample,
+                                          ccl_global TilesInfo *tiles,
+                                          int m_offset, int v_offset, int x, int y,
+                                          ccl_global float *mean,
+                                          ccl_global float *variance,
+                                          int4 rect, int buffer_pass_stride, int buffer_denoising_offset)
+{
+	/* Select which of the 3x3 tiles contains the pixel. */
+	const int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
+	const int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
+	const int tile = ytile*3+xtile;
+	const int pixel_ofs = (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride;
+	ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + pixel_ofs + buffer_denoising_offset;
+
+	const int buffer_w = align_up(rect.z - rect.x, 4);
+	const int idx = (y-rect.y)*buffer_w + (x - rect.x);
+
+	const float feature_mean = center_buffer[m_offset] / sample;
+	mean[idx] = feature_mean;
+	if(sample <= 1) {
+		/* Can't compute variance with single sample, just set it very high. */
+		variance[idx] = 1e10f;
+		return;
+	}
+	/* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
+	 * update does not work efficiently with atomics in the kernel. */
+	variance[idx] = max(0.0f, (center_buffer[v_offset] - feature_mean*feature_mean*sample) / (sample * (sample-1)));
+}
+
+/* Flags pixels that are suspiciously bright compared to their (up to) 5x5 neighborhood as outliers by negating
+ * their depth, and writes the three color channels to out - scaled down for outliers, zeroed for pixels with a
+ * negative channel (which are flagged as well), unchanged otherwise. Variances of outliers are scaled to match. */
+ccl_device void kernel_filter_detect_outliers(int x, int y,
+                                              ccl_global float *image, ccl_global float *variance,
+                                              ccl_global float *depth, ccl_global float *out,
+                                              int4 rect, int pass_stride)
+{
+	const int buffer_w = align_up(rect.z - rect.x, 4);
+	const int idx = (y-rect.y)*buffer_w + (x-rect.x);
+	const float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]);
+
+	float fac = 1.0f;
+	if(color.x < 0.0f || color.y < 0.0f || color.z < 0.0f) {
+		depth[idx] = -depth[idx];
+		fac = 0.0f;
+	}
+	else {
+		const float center_L = average(color);
+		/* Collect the brightness of all neighbors inside rect, keeping values[] sorted in ascending order. */
+		float values[25];
+		int n = 0;
+		for(int ny = max(y-2, rect.y); ny < min(y+3, rect.w); ny++) {
+			for(int nx = max(x-2, rect.x); nx < min(x+3, rect.z); nx++) {
+				const int n_idx = (ny-rect.y)*buffer_w + (nx-rect.x);
+				const float n_L = average(make_float3(image[n_idx], image[n_idx+pass_stride], image[n_idx+2*pass_stride]));
+				/* Find the position of n_L: the first stored value that is larger. */
+				int pos;
+				for(pos = 0; pos < n; pos++) {
+					if(values[pos] > n_L) break;
+				}
+				/* Make space for n_L by shifting all following values to the right, then insert it. */
+				for(int j = n; j > pos; j--) {
+					values[j] = values[j-1];
+				}
+				values[pos] = n_L;
+				n++;
+			}
+		}
+
+		/* Outlier threshold: twice the value found at 75% of the sorted neighborhood. */
+		const float ref = 2.0f*values[(int)(n*0.75f)];
+		if(center_L > ref) {
+			/* The pixel appears to be an outlier.
+			 * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel
+			 * should actually be at the reference value:
+			 * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier.
+			 * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight.
+			 */
+			const float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride])));
+			if(center_L - 3*stddev < ref) {
+				/* The pixel is an outlier, so negate the depth value to mark it as one.
+				 * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
+				depth[idx] = -depth[idx];
+				fac = ref/center_L;
+				const float fac_sq = fac*fac;
+				variance[idx              ] *= fac_sq;
+				variance[idx + pass_stride] *= fac_sq;
+				variance[idx+2*pass_stride] *= fac_sq;
+			}
+		}
+	}
+	out[idx              ] = fac*image[idx];
+	out[idx + pass_stride] = fac*image[idx + pass_stride];
+	out[idx+2*pass_stride] = fac*image[idx+2*pass_stride];
+}
+
+/* Combine A/B buffers.
+ * Calculates the combined mean (skipped if mean is NULL) and the buffer variance (skipped if variance is NULL).
+ * For r > 0 the variance is a high quantile (index 7*n/8 of the n sorted values) of the per-pixel variances within
+ * radius r, clamped to rect. The radius is limited to 2, since values[] below only has room for a 5x5 window. */
+ccl_device void kernel_filter_combine_halves(int x, int y,
+                                             ccl_global float *mean, ccl_global float *variance,
+                                             ccl_global float *a, ccl_global float *b,
+                                             int4 rect, int r)
+{
+	const int buffer_w = align_up(rect.z - rect.x, 4);
+	const int idx = (y-rect.y)*buffer_w + (x - rect.x);
+	if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]);
+	if(!variance) return;
+	if(r <= 0) {
+		variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]);
+		return;
+	}
+	/* (2*2+1)^2 == 25 values is all that fits, so larger radii are clamped instead of overflowing values[]. */
+	const int radius = min(r, 2);
+	float values[25];
+	int num_values = 0;
+	for(int py = max(y-radius, rect.y); py < min(y+radius+1, rect.w); py++) {
+		for(int px = max(x-radius, rect.x); px < min(x+radius+1, rect.z); px++) {
+			const int pidx = (py-rect.y)*buffer_w + (px-rect.x);
+			values[num_values++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]);
+		}
+	}
+	/* Insertion-sort the variances (fast enough for 25 elements). */
+	for(int i = 1; i < num_values; i++) {
+		float v = values[i];
+		int j;
+		for(j = i-1; j >= 0 && values[j] > v; j--)
+			values[j+1] = values[j];
+		values[j+1] = v;
+	}
+	variance[idx] = values[(7*num_values)/8];
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
new file mode 100644
index 00000000000..25a3025056c
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Adds the neighbor pixel q = (x+dx, y+dy) to the accumulators XtWX and XtWY of the pixel p = (x, y):
+ * the design row built by filter_get_design_row_transform() is added with the given weight, using (*rank)+1 entries.
+ * Nothing is added if weight is below 1e-3 or if feature 0 of q is negative. The storage stride is only
+ * honored under __KERNEL_GPU__ (otherwise 1); localIdx only selects the design row slot under __KERNEL_CUDA__. */
+ccl_device_inline void kernel_filter_construct_gramian(int x, int y, int storage_stride,
+                                                       int dx, int dy, int w, int h, int pass_stride,
+                                                       const ccl_global float *ccl_restrict buffer,
+                                                       const ccl_global float *ccl_restrict transform,
+                                                       ccl_global int *rank, float weight,
+                                                       ccl_global float *XtWX,
+                                                       ccl_global float3 *XtWY,
+                                                       int localIdx)
+{
+	/* Neighbors with a negligible weight are skipped entirely. */
+	if(weight < 1e-3f) {
+		return;
+	}
+
+	const int p_offset = y*w + x;
+	const int q_offset = p_offset + dy*w + dx;
+
+#ifdef __KERNEL_GPU__
+	const int stride = storage_stride;
+#else
+	const int stride = 1;
+	(void) storage_stride;
+#endif
+
+#ifdef __KERNEL_CUDA__
+	ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
+	ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
+#else
+	float design_row[DENOISE_FEATURES+1];
+#endif
+
+	float3 q_color = filter_get_color(buffer + q_offset, pass_stride);
+	/* If the pixel was flagged as an outlier during prefiltering, skip it. */
+	if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) {
+		return;
+	}
+
+	const int pixel_rank = *rank;
+	filter_get_design_row_transform(make_int2(x, y),       buffer + p_offset,
+	                                make_int2(x+dx, y+dy), buffer + q_offset,
+	                                pass_stride, pixel_rank, design_row, transform, stride);
+	math_trimatrix_add_gramian_strided(XtWX, pixel_rank+1, design_row, weight, stride);
+	math_vec3_add_strided(XtWY, pixel_rank+1, design_row, weight * q_color, stride);
+}
+
+/* Solves the accumulated system of the pixel (x, y) and writes the resulting color, multiplied by sample, into
+ * buffer. buffer_params holds (offset, stride, pass stride, pass offset): the pixel's floats start at
+ * (y*stride + x + offset)*pass stride, and if the pass offset is non-zero, the three floats found there are
+ * added to the color first. Pixels whose XtWX[0] is below 1e-3 are left untouched. */
+ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
+                                              ccl_global float *buffer,
+                                              ccl_global int *rank, int storage_stride,
+                                              ccl_global float *XtWX, ccl_global float3 *XtWY,
+                                              int4 buffer_params, int sample)
+{
+#ifdef __KERNEL_GPU__
+	const int stride = storage_stride;
+#else
+	const int stride = 1;
+	(void) storage_stride;
+#endif
+
+	const float total_weight = XtWX[0];
+	if(total_weight < 1e-3f) {
+		/* There is not enough information to determine a denoised result.
+		 * As a fallback, keep the original value of the pixel. */
+		return;
+	}
+
+	/* The weighted average of pixel colors (essentially, the NLM-filtered image).
+	 * In case the solution of the linear model fails due to numerical issues,
+	 * fall back to this value. It has to be computed before the solve below. */
+	const float3 mean_color = XtWY[0]/total_weight;
+	math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride);
+
+	float3 final_color = XtWY[0];
+	if(!isfinite3_safe(final_color)) {
+		final_color = mean_color;
+	}
+
+	/* Clamp pixel value to positive values. */
+	final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f));
+	final_color *= sample;
+	ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
+	if(buffer_params.w) {
+		final_color.x += combined_buffer[buffer_params.w+0];
+		final_color.y += combined_buffer[buffer_params.w+1];
+		final_color.z += combined_buffer[buffer_params.w+2];
+	}
+	combined_buffer[0] = final_color.x;
+	combined_buffer[1] = final_color.y;
+	combined_buffer[2] = final_color.z;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
new file mode 100644
index 00000000000..a5f87c05ec0
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
+                                                  int x, int y, int4 rect,
+                                                  int pass_stride,
+                                                  float *transform, int *rank,
+                                                  int radius, float pca_threshold)
+{ /* Builds the feature-space transform for pixel (x, y) and stores the number of kept components in *rank. */
+	int buffer_w = align_up(rect.z - rect.x, 4); /* Not referenced directly below; presumably used by FOR_PIXEL_WINDOW - confirm. */
+
+	float features[DENOISE_FEATURES];
+
+	/* Temporary storage, used in different steps of the algorithm. */
+	float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES];
+	float tempvector[2*DENOISE_FEATURES];
+	const float *ccl_restrict pixel_buffer;
+	int2 pixel; /* pixel and pixel_buffer are presumably assigned by FOR_PIXEL_WINDOW - confirm. */
+
+	/* === Calculate denoising window. === */
+	int2 low  = make_int2(max(rect.x, x - radius),
+	                      max(rect.y, y - radius));
+	int2 high = make_int2(min(rect.z, x + radius + 1),
+	                      min(rect.w, y + radius + 1));
+	int num_pixels = (high.y - low.y) * (high.x - low.x);
+
+	/* === Shift feature passes to have mean 0. === */
+	float feature_means[DENOISE_FEATURES];
+	math_vector_zero(feature_means, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW {
+		filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
+		math_vector_add(feature_means, features, DENOISE_FEATURES);
+	} END_FOR_PIXEL_WINDOW
+
+	math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES);
+
+	/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+	float *feature_scale = tempvector;
+	math_vector_zero(feature_scale, DENOISE_FEATURES);
+
+	FOR_PIXEL_WINDOW {
+		filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
+		math_vector_max(feature_scale, features, DENOISE_FEATURES);
+	} END_FOR_PIXEL_WINDOW
+
+	filter_calculate_scale(feature_scale);
+
+	/* === Generate the feature transformation. ===
+	 * This transformation maps the DENOISE_FEATURES-dimensional feature space to a reduced feature (r-feature) space
+	 * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+	float* feature_matrix = tempmatrix;
+	math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW {
+		filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
+		math_vector_mul(features, feature_scale, DENOISE_FEATURES);
+		math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+	} END_FOR_PIXEL_WINDOW
+	/* NOTE(review): the diagonal of feature_matrix is read as per-component energy below, presumably sorted eigenvalues - confirm. */
+	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+	*rank = 0;
+	/* Prevent overfitting when a small window is used: at most one component per three pixels. */
+	int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+	if(pca_threshold < 0.0f) { /* Negative threshold: relative energy criterion. */
+		float threshold_energy = 0.0f;
+		for(int i = 0; i < DENOISE_FEATURES; i++) {
+			threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+		}
+		threshold_energy *= 1.0f - (-pca_threshold); /* (1 + pca_threshold) of the diagonal sum. */
+		/* Keep the first two components (if max_rank allows), then stop once the accumulated energy reaches the threshold. */
+		float reduced_energy = 0.0f;
+		for(int i = 0; i < max_rank; i++, (*rank)++) {
+			if(i >= 2 && reduced_energy >= threshold_energy)
+				break;
+			float s = feature_matrix[i*DENOISE_FEATURES+i];
+			reduced_energy += s;
+		}
+	}
+	else { /* Otherwise stop at the first component, after the first two, whose sqrt(diagonal) is below pca_threshold. */
+		for(int i = 0; i < max_rank; i++, (*rank)++) {
+			float s = feature_matrix[i*DENOISE_FEATURES+i];
+			if(i >= 2 && sqrtf(s) < pca_threshold)
+				break;
+		}
+	}
+
+	/* Bake the feature scaling into the first *rank rows of the transformation matrix, then transpose it. */
+	for(int i = 0; i < (*rank); i++) {
+		math_vector_mul(transform + i*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
+	}
+	math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
new file mode 100644
index 00000000000..83a1222bbdb
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform_gpu.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
+                                                  int x, int y, int4 rect,
+                                                  int pass_stride,
+                                                  ccl_global float *transform,
+                                                  ccl_global int *rank,
+                                                  int radius, float pca_threshold,
+                                                  int transform_stride, int localIdx)
+{ /* Builds the feature-space transform for pixel (x, y) and stores the number of kept components in *rank. */
+	int buffer_w = align_up(rect.z - rect.x, 4); /* Not referenced directly below; presumably used by FOR_PIXEL_WINDOW - confirm. */
+	/* With __KERNEL_CUDA__, features is the localIdx-th DENOISE_FEATURES-sized slice of a ccl_local array; otherwise a function-local array. */
+#ifdef __KERNEL_CUDA__
+	ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE];
+	ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES;
+#else
+	float features[DENOISE_FEATURES];
+#endif
+
+	/* === Calculate denoising window. === */
+	int2 low  = make_int2(max(rect.x, x - radius),
+	                      max(rect.y, y - radius));
+	int2 high = make_int2(min(rect.z, x + radius + 1),
+	                      min(rect.w, y + radius + 1));
+	int num_pixels = (high.y - low.y) * (high.x - low.x);
+	const ccl_global float *ccl_restrict pixel_buffer;
+	int2 pixel; /* pixel and pixel_buffer are presumably assigned by FOR_PIXEL_WINDOW - confirm. */
+
+
+
+
+	/* === Shift feature passes to have mean 0. === */
+	float feature_means[DENOISE_FEATURES];
+	math_vector_zero(feature_means, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW {
+		filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
+		math_vector_add(feature_means, features, DENOISE_FEATURES);
+	} END_FOR_PIXEL_WINDOW
+
+	math_vector_scale(feature_means, 1.0f / num_pixels, DENOISE_FEATURES);
+
+	/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
+	float feature_scale[DENOISE_FEATURES];
+	math_vector_zero(feature_scale, DENOISE_FEATURES);
+
+	FOR_PIXEL_WINDOW {
+		filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
+		math_vector_max(feature_scale, features, DENOISE_FEATURES);
+	} END_FOR_PIXEL_WINDOW
+
+	filter_calculate_scale(feature_scale);
+
+
+
+	/* === Generate the feature transformation. ===
+	 * This transformation maps the DENOISE_FEATURES-dimensional feature space to a reduced feature (r-feature) space
+	 * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
+	float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+	math_matrix_zero(feature_matrix, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW {
+		filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
+		math_vector_mul(features, feature_scale, DENOISE_FEATURES);
+		math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
+	} END_FOR_PIXEL_WINDOW
+	/* NOTE(review): the diagonal of feature_matrix is read as per-component energy below, presumably sorted eigenvalues - confirm. */
+	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
+	*rank = 0;
+	/* Prevent overfitting when a small window is used: at most one component per three pixels. */
+	int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+	if(pca_threshold < 0.0f) { /* Negative threshold: relative energy criterion. */
+		float threshold_energy = 0.0f;
+		for(int i = 0; i < DENOISE_FEATURES; i++) {
+			threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+		}
+		threshold_energy *= 1.0f - (-pca_threshold); /* (1 + pca_threshold) of the diagonal sum. */
+		/* Keep the first two components (if max_rank allows), then stop once the accumulated energy reaches the threshold. */
+		float reduced_energy = 0.0f;
+		for(int i = 0; i < max_rank; i++, (*rank)++) {
+			if(i >= 2 && reduced_energy >= threshold_energy)
+				break;
+			float s = feature_matrix[i*DENOISE_FEATURES+i];
+			reduced_energy += s;
+		}
+	}
+	else { /* Otherwise stop at the first component, after the first two, whose sqrt(diagonal) is below pca_threshold. */
+		for(int i = 0; i < max_rank; i++, (*rank)++) {
+			float s = feature_matrix[i*DENOISE_FEATURES+i];
+			if(i >= 2 && sqrtf(s) < pca_threshold)
+				break;
+		}
+	}
+
+	math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride);
+
+	/* Bake the feature scaling into the first *rank columns of the transposed matrix (elements are transform_stride apart). */
+	for(int i = 0; i < DENOISE_FEATURES; i++) {
+		for(int j = 0; j < (*rank); j++) {
+			transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i];
+		}
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
new file mode 100644
index 00000000000..9e65f61664b
--- /dev/null
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
+                                                  int x, int y, int4 rect,
+                                                  int pass_stride,
+                                                  float *transform, int *rank,
+                                                  int radius, float pca_threshold)
+{ /* Builds the feature-space transform for pixel (x, y) and stores the number of kept components in *rank. */
+	int buffer_w = align_up(rect.z - rect.x, 4); /* Not referenced directly below; presumably used by FOR_PIXEL_WINDOW_SSE - confirm. */
+
+	float4 features[DENOISE_FEATURES];
+	const float *ccl_restrict pixel_buffer;
+	int2 pixel; /* NOTE(review): x4, y4 and active_pixels are not declared here; presumably FOR_PIXEL_WINDOW_SSE provides them - confirm. */
+	/* === Calculate denoising window. === */
+	int2 low  = make_int2(max(rect.x, x - radius),
+	                      max(rect.y, y - radius));
+	int2 high = make_int2(min(rect.z, x + radius + 1),
+	                      min(rect.w, y + radius + 1));
+	int num_pixels = (high.y - low.y) * (high.x - low.x);
+	/* === Shift feature passes to have mean 0. === */
+	float4 feature_means[DENOISE_FEATURES];
+	math_vector_zero_sse(feature_means, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW_SSE {
+		filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
+		math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
+	} END_FOR_PIXEL_WINDOW_SSE
+
+	float4 pixel_scale = make_float4(1.0f / num_pixels);
+	for(int i = 0; i < DENOISE_FEATURES; i++) {
+		feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
+	}
+	/* === Scale the shifted feature passes, will be baked into the transform later. === */
+	float4 feature_scale[DENOISE_FEATURES];
+	math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW_SSE {
+		filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
+		math_vector_max_sse(feature_scale, features, DENOISE_FEATURES);
+	} END_FOR_PIXEL_WINDOW_SSE
+
+	filter_calculate_scale_sse(feature_scale);
+	/* === Generate the feature transformation. === */
+	float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
+	math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
+	FOR_PIXEL_WINDOW_SSE {
+		filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
+		math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
+		math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f));
+	} END_FOR_PIXEL_WINDOW_SSE
+	/* NOTE(review): math_matrix_hsum presumably sums the four lanes of each entry into the scalar matrix - confirm. */
+	float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
+	math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
+	/* NOTE(review): the diagonal of feature_matrix is read as per-component energy below, presumably sorted eigenvalues - confirm. */
+	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
+
+	*rank = 0;
+	/* Prevent overfitting when a small window is used: at most one component per three pixels. */
+	int max_rank = min(DENOISE_FEATURES, num_pixels/3);
+	if(pca_threshold < 0.0f) { /* Negative threshold: relative energy criterion. */
+		float threshold_energy = 0.0f;
+		for(int i = 0; i < DENOISE_FEATURES; i++) {
+			threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
+		}
+		threshold_energy *= 1.0f - (-pca_threshold); /* (1 + pca_threshold) of the diagonal sum. */
+		/* Keep the first two components (if max_rank allows), then stop once the accumulated energy reaches the threshold. */
+		float reduced_energy = 0.0f;
+		for(int i = 0; i < max_rank; i++, (*rank)++) {
+			if(i >= 2 && reduced_energy >= threshold_energy)
+				break;
+			float s = feature_matrix[i*DENOISE_FEATURES+i];
+			reduced_energy += s;
+		}
+	}
+	else { /* Otherwise stop at the first component, after the first two, whose sqrt(diagonal) is below pca_threshold. */
+		for(int i = 0; i < max_rank; i++, (*rank)++) {
+			float s = feature_matrix[i*DENOISE_FEATURES+i];
+			if(i >= 2 && sqrtf(s) < pca_threshold)
+				break;
+		}
+	}
+
+	math_matrix_transpose(transform, DENOISE_FEATURES, 1);
+
+	/* Bake the feature scaling into the transformation matrix: first *rank entries of each row, using lane 0 of the scale. */
+	for(int i = 0; i < DENOISE_FEATURES; i++) {
+		math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank);
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index 24ced934c8b..f34b77ebc07 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -14,17 +14,20 @@
  * limitations under the License.
  */
 
-#include "geom_attribute.h"
-#include "geom_object.h"
+#include "kernel/geom/geom_attribute.h"
+#include "kernel/geom/geom_object.h"
 #ifdef __PATCH_EVAL__
-#  include "geom_patch.h"
+#  include "kernel/geom/geom_patch.h"
 #endif
-#include "geom_triangle.h"
-#include "geom_subd_triangle.h"
-#include "geom_triangle_intersect.h"
-#include "geom_motion_triangle.h"
-#include "geom_motion_curve.h"
-#include "geom_curve.h"
-#include "geom_volume.h"
-#include "geom_primitive.h"
+#include "kernel/geom/geom_triangle.h"
+#include "kernel/geom/geom_subd_triangle.h"
+#include "kernel/geom/geom_triangle_intersect.h"
+#include "kernel/geom/geom_motion_triangle.h"
+#include "kernel/geom/geom_motion_triangle_intersect.h"
+#include "kernel/geom/geom_motion_triangle_shader.h"
+#include "kernel/geom/geom_motion_curve.h"
+#include "kernel/geom/geom_curve.h"
+#include "kernel/geom/geom_curve_intersect.h"
+#include "kernel/geom/geom_volume.h"
+#include "kernel/geom/geom_primitive.h"
 
diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h
index 08ccee56335..cc62192ef21 100644
--- a/intern/cycles/kernel/geom/geom_attribute.h
+++ b/intern/cycles/kernel/geom/geom_attribute.h
@@ -30,7 +30,7 @@ ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *
 ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd)
 {
 #ifdef __HAIR__
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
 		return ATTR_PRIM_CURVE;
 	}
 	else
@@ -53,12 +53,12 @@ ccl_device_inline AttributeDescriptor attribute_not_found()
 
 ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id)
 {
-	if(ccl_fetch(sd, object) == PRIM_NONE) {
+	if(sd->object == PRIM_NONE) {
 		return attribute_not_found();
 	}
 
 	/* for SVM, find attribute by unique id */
-	uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride;
+	uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
 	attr_offset += attribute_primitive_type(kg, sd);
 	uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 	
@@ -73,7 +73,7 @@ ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const Sh
 	AttributeDescriptor desc;
 	desc.element = (AttributeElement)attr_map.y;
 	
-	if(ccl_fetch(sd, prim) == PRIM_NONE &&
+	if(sd->prim == PRIM_NONE &&
 	   desc.element != ATTR_ELEMENT_MESH &&
 	   desc.element != ATTR_ELEMENT_VOXEL &&
 	   desc.element != ATTR_ELEMENT_OBJECT)
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index 84aaaab7453..e35267f02bf 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -16,9 +16,10 @@ CCL_NAMESPACE_BEGIN
 
 /* Curve Primitive
  *
- * Curve primitive for rendering hair and fur. These can be render as flat ribbons
- * or curves with actual thickness. The curve can also be rendered as line segments
- * rather than curves for better performance */
+ * Curve primitive for rendering hair and fur. These can be rendered as flat
+ * ribbons or curves with actual thickness. The curve can also be rendered as
+ * line segments rather than curves for better performance.
+ */
 
 #ifdef __HAIR__
 
@@ -32,22 +33,22 @@ ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 		if(dy) *dy = 0.0f;
 #endif
 
-		return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim));
+		return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim);
 	}
 	else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) {
-		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
 		int k1 = k0 + 1;
 
 		float f0 = kernel_tex_fetch(__attributes_float, desc.offset + k0);
 		float f1 = kernel_tex_fetch(__attributes_float, desc.offset + k1);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
+		if(dx) *dx = sd->du.dx*(f1 - f0);
 		if(dy) *dy = 0.0f;
 #endif
 
-		return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1;
+		return (1.0f - sd->u)*f0 + sd->u*f1;
 	}
 	else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -71,22 +72,22 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim)));
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim));
 	}
 	else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) {
-		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
 		int k1 = k0 + 1;
 
 		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k0));
 		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k1));
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
+		if(dx) *dx = sd->du.dx*(f1 - f0);
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
-		return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1;
+		return (1.0f - sd->u)*f0 + sd->u*f1;
 	}
 	else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -104,22 +105,22 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
 {
 	float r = 0.0f;
 
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
-		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
+		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
 		int k1 = k0 + 1;
 
 		float4 P_curve[2];
 
-		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
+		if(sd->type & PRIMITIVE_CURVE) {
 			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 		}
 		else {
-			motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve);
+			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
 		}
 
-		r = (P_curve[1].w - P_curve[0].w) * ccl_fetch(sd, u) + P_curve[0].w;
+		r = (P_curve[1].w - P_curve[0].w) * sd->u + P_curve[0].w;
 	}
 
 	return r*2.0f;
@@ -130,8 +131,8 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
 
 ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd)
 {
-	float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
-	int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+	float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+	int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
 	int k1 = k0 + 1;
 
 	float4 P_curve[2];
@@ -139,23 +140,23 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd
 	P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 	P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 
-	return float4_to_float3(P_curve[1]) * ccl_fetch(sd, u) + float4_to_float3(P_curve[0]) * (1.0f - ccl_fetch(sd, u));
+	return float4_to_float3(P_curve[1]) * sd->u + float4_to_float3(P_curve[0]) * (1.0f - sd->u);
 }
 
 /* Curve tangent normal */
 
 ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
-{	
+{
 	float3 tgN = make_float3(0.0f,0.0f,0.0f);
 
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
 
-		tgN = -(-ccl_fetch(sd, I) - ccl_fetch(sd, dPdu) * (dot(ccl_fetch(sd, dPdu),-ccl_fetch(sd, I)) / len_squared(ccl_fetch(sd, dPdu))));
+		tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu)));
 		tgN = normalize(tgN);
 
 		/* need to find suitable scaled gd for corrected normal */
 #if 0
-		tgN = normalize(tgN - gd * ccl_fetch(sd, dPdu));
+		tgN = normalize(tgN - gd * sd->dPdu);
 #endif
 	}
 
@@ -213,817 +214,6 @@ ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta,
 	}
 }
 
-#ifdef __KERNEL_SSE2__
-ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)
-{
-	return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2]));
-}
-#endif
-
-#ifdef __KERNEL_SSE2__
-/* Pass P and dir by reference to aligned vector */
-ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
-#else
-ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax)
-#endif
-{
-	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
-	float epsilon = 0.0f;
-	float r_st, r_en;
-
-	int depth = kernel_data.curve.subdivisions;
-	int flags = kernel_data.curve.curveflags;
-	int prim = kernel_tex_fetch(__prim_index, curveAddr);
-
-#ifdef __KERNEL_SSE2__
-	ssef vdir = load4f(dir);
-	ssef vcurve_coef[4];
-	const float3 *curve_coef = (float3 *)vcurve_coef;
-	
-	{
-		ssef dtmp = vdir * vdir;
-		ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp));
-		ssef rd_ss = load1f_first(1.0f) / d_ss;
-
-		ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]);
-		int2 &v00 = (int2 &)v00vec;
-
-		int k0 = v00.x + segment;
-		int k1 = k0 + 1;
-		int ka = max(k0 - 1, v00.x);
-		int kb = min(k1 + 1, v00.x + v00.y - 1);
-
-		ssef P_curve[4];
-
-		if(type & PRIMITIVE_CURVE) {
-			P_curve[0] = load4f(&kg->__curve_keys.data[ka].x);
-			P_curve[1] = load4f(&kg->__curve_keys.data[k0].x);
-			P_curve[2] = load4f(&kg->__curve_keys.data[k1].x);
-			P_curve[3] = load4f(&kg->__curve_keys.data[kb].x);
-		}
-		else {
-			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
-			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve);
-		}
-
-		ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss));
-		ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn;
-		ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy;
-		ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
-		ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
-
-		ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
-		ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0);
-		ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
-
-		ssef htfm[] = { htfm0, htfm1, htfm2 };
-		ssef vP = load4f(P);
-		ssef p0 = transform_point_T3(htfm, P_curve[0] - vP);
-		ssef p1 = transform_point_T3(htfm, P_curve[1] - vP);
-		ssef p2 = transform_point_T3(htfm, P_curve[2] - vP);
-		ssef p3 = transform_point_T3(htfm, P_curve[3] - vP);
-
-		float fc = 0.71f;
-		ssef vfc = ssef(fc);
-		ssef vfcxp3 = vfc * p3;
-
-		vcurve_coef[0] = p1;
-		vcurve_coef[1] = vfc * (p2 - p0);
-		vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3)));
-		vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3));
-
-		r_st = ((float4 &)P_curve[1]).w;
-		r_en = ((float4 &)P_curve[2]).w;
-	}
-#else
-	float3 curve_coef[4];
-
-	/* curve Intersection check */
-	/* obtain curve parameters */
-	{
-		/* ray transform created - this should be created at beginning of intersection loop */
-		Transform htfm;
-		float d = sqrtf(dir.x * dir.x + dir.z * dir.z);
-		htfm = make_transform(
-			dir.z / d, 0, -dir.x /d, 0,
-			-dir.x * dir.y /d, d, -dir.y * dir.z /d, 0,
-			dir.x, dir.y, dir.z, 0,
-			0, 0, 0, 1);
-
-		float4 v00 = kernel_tex_fetch(__curves, prim);
-
-		int k0 = __float_as_int(v00.x) + segment;
-		int k1 = k0 + 1;
-
-		int ka = max(k0 - 1,__float_as_int(v00.x));
-		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
-
-		float4 P_curve[4];
-
-		if(type & PRIMITIVE_CURVE) {
-			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
-			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
-			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
-			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
-		}
-		else {
-			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
-			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve);
-		}
-
-		float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P);
-		float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P);
-		float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P);
-		float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P);
-
-		float fc = 0.71f;
-		curve_coef[0] = p1;
-		curve_coef[1] = -fc*p0 + fc*p2;
-		curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3;
-		curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3;
-		r_st = P_curve[1].w;
-		r_en = P_curve[2].w;
-	}
-#endif
-
-	float r_curr = max(r_st, r_en);
-
-	if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING))
-		epsilon = 2 * r_curr;
-
-	/* find bounds - this is slow for cubic curves */
-	float upper, lower;
-
-	float zextrem[4];
-	curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z);
-	if(lower - r_curr > isect->t || upper + r_curr < epsilon)
-		return false;
-
-	/* minimum width extension */
-	float mw_extension = min(difl * fabsf(upper), extmax);
-	float r_ext = mw_extension + r_curr;
-
-	float xextrem[4];
-	curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x);
-	if(lower > r_ext || upper < -r_ext)
-		return false;
-
-	float yextrem[4];
-	curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y);
-	if(lower > r_ext || upper < -r_ext)
-		return false;
-
-	/* setup recurrent loop */
-	int level = 1 << depth;
-	int tree = 0;
-	float resol = 1.0f / (float)level;
-	bool hit = false;
-
-	/* begin loop */
-	while(!(tree >> (depth))) {
-		float i_st = tree * resol;
-		float i_en = i_st + (level * resol);
-#ifdef __KERNEL_SSE2__
-		ssef vi_st = ssef(i_st), vi_en = ssef(i_en);
-		ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
-		ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]);
-
-		ssef vbmin = min(vp_st, vp_en);
-		ssef vbmax = max(vp_st, vp_en);
-
-		float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax;
-		float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z;
-		float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z;
-		float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en;
-#else
-		float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0];
-		float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0];
-		
-		float bminx = min(p_st.x, p_en.x);
-		float bmaxx = max(p_st.x, p_en.x);
-		float bminy = min(p_st.y, p_en.y);
-		float bmaxy = max(p_st.y, p_en.y);
-		float bminz = min(p_st.z, p_en.z);
-		float bmaxz = max(p_st.z, p_en.z);
-#endif
-
-		if(xextrem[0] >= i_st && xextrem[0] <= i_en) {
-			bminx = min(bminx,xextrem[1]);
-			bmaxx = max(bmaxx,xextrem[1]);
-		}
-		if(xextrem[2] >= i_st && xextrem[2] <= i_en) {
-			bminx = min(bminx,xextrem[3]);
-			bmaxx = max(bmaxx,xextrem[3]);
-		}
-		if(yextrem[0] >= i_st && yextrem[0] <= i_en) {
-			bminy = min(bminy,yextrem[1]);
-			bmaxy = max(bmaxy,yextrem[1]);
-		}
-		if(yextrem[2] >= i_st && yextrem[2] <= i_en) {
-			bminy = min(bminy,yextrem[3]);
-			bmaxy = max(bmaxy,yextrem[3]);
-		}
-		if(zextrem[0] >= i_st && zextrem[0] <= i_en) {
-			bminz = min(bminz,zextrem[1]);
-			bmaxz = max(bmaxz,zextrem[1]);
-		}
-		if(zextrem[2] >= i_st && zextrem[2] <= i_en) {
-			bminz = min(bminz,zextrem[3]);
-			bmaxz = max(bmaxz,zextrem[3]);
-		}
-
-		float r1 = r_st + (r_en - r_st) * i_st;
-		float r2 = r_st + (r_en - r_st) * i_en;
-		r_curr = max(r1, r2);
-
-		mw_extension = min(difl * fabsf(bmaxz), extmax);
-		float r_ext = mw_extension + r_curr;
-		float coverage = 1.0f;
-
-		if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
-			/* the bounding box does not overlap the square centered at O */
-			tree += level;
-			level = tree & -tree;
-		}
-		else if(level == 1) {
-
-			/* the maximum recursion depth is reached.
-			 * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0.
-			 * dP* is reversed if necessary.*/
-			float t = isect->t;
-			float u = 0.0f;
-			float gd = 0.0f;
-
-			if(flags & CURVE_KN_RIBBONS) {
-				float3 tg = (p_en - p_st);
-				float w = tg.x * tg.x + tg.y * tg.y;
-				if(w == 0) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-				w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
-				w = saturate(w);
-
-				/* compute u on the curve segment */
-				u = i_st * (1 - w) + i_en * w;
-				r_curr = r_st + (r_en - r_st) * u;
-				/* compare x-y distances */
-				float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0];
-
-				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-				if(dot(tg, dp_st)< 0)
-					dp_st *= -1;
-				if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-				if(dot(tg, dp_en) < 0)
-					dp_en *= -1;
-				if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-
-				/* compute coverage */
-				float r_ext = r_curr;
-				coverage = 1.0f;
-				if(difl != 0.0f) {
-					mw_extension = min(difl * fabsf(bmaxz), extmax);
-					r_ext = mw_extension + r_curr;
-					float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y);
-					float d0 = d - r_curr;
-					float d1 = d + r_curr;
-					float inv_mw_extension = 1.0f/mw_extension;
-					if(d0 >= 0)
-						coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f;
-					else // inside
-						coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f;
-				}
-				
-				if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-
-				t = p_curr.z;
-
-				/* stochastic fade from minimum width */
-				if(difl != 0.0f && lcg_state) {
-					if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
-						return hit;
-				}
-			}
-			else {
-				float l = len(p_en - p_st);
-				/* minimum width extension */
-				float or1 = r1;
-				float or2 = r2;
-
-				if(difl != 0.0f) {
-					mw_extension = min(len(p_st - P) * difl, extmax);
-					or1 = r1 < mw_extension ? mw_extension : r1;
-					mw_extension = min(len(p_en - P) * difl, extmax);
-					or2 = r2 < mw_extension ? mw_extension : r2;
-				}
-				/* --- */
-				float invl = 1.0f/l;
-				float3 tg = (p_en - p_st) * invl;
-				gd = (or2 - or1) * invl;
-				float difz = -dot(p_st,tg);
-				float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd));
-				float invcyla = 1.0f/cyla;
-				float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1)));
-				float tcentre = -halfb*invcyla;
-				float zcentre = difz + (tg.z * tcentre);
-				float3 tdif = - p_st;
-				tdif.z += tcentre;
-				float tdifz = dot(tdif,tg);
-				float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1)));
-				float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd;
-				float td = tb*tb - 4*cyla*tc;
-				if(td < 0.0f) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-				
-				float rootd = sqrtf(td);
-				float correction = (-tb - rootd) * 0.5f * invcyla;
-				t = tcentre + correction;
-
-				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-				if(dot(tg, dp_st)< 0)
-					dp_st *= -1;
-				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-				if(dot(tg, dp_en) < 0)
-					dp_en *= -1;
-
-				if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) {
-					correction = (-tb + rootd) * 0.5f * invcyla;
-					t = tcentre + correction;
-				}			
-
-				if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-
-				float w = (zcentre + (tg.z * correction)) * invl;
-				w = saturate(w);
-				/* compute u on the curve segment */
-				u = i_st * (1 - w) + i_en * w;
-
-				/* stochastic fade from minimum width */
-				if(difl != 0.0f && lcg_state) {
-					r_curr = r1 + (r2 - r1) * w;
-					r_ext = or1 + (or2 - or1) * w;
-					coverage = r_curr/r_ext;
-
-					if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
-						return hit;
-				}
-			}
-			/* we found a new intersection */
-
-#ifdef __VISIBILITY_FLAG__
-			/* visibility flag test. we do it here under the assumption
-			 * that most triangles are culled by node flags */
-			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
-#endif
-			{
-				/* record intersection */
-				isect->t = t;
-				isect->u = u;
-				isect->v = gd;
-				isect->prim = curveAddr;
-				isect->object = object;
-				isect->type = type;
-				hit = true;
-			}
-			
-			tree++;
-			level = tree & -tree;
-		}
-		else {
-			/* split the curve into two curves and process */
-			level = level >> 1;
-		}
-	}
-
-	return hit;
-}
-
-ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
-{
-	/* define few macros to minimize code duplication for SSE */
-#ifndef __KERNEL_SSE2__
-#  define len3_squared(x) len_squared(x)
-#  define len3(x) len(x)
-#  define dot3(x, y) dot(x, y)
-#endif
-
-	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
-	/* curve Intersection check */
-	int flags = kernel_data.curve.curveflags;
-
-	int prim = kernel_tex_fetch(__prim_index, curveAddr);
-	float4 v00 = kernel_tex_fetch(__curves, prim);
-
-	int cnum = __float_as_int(v00.x);
-	int k0 = cnum + segment;
-	int k1 = k0 + 1;
-
-#ifndef __KERNEL_SSE2__
-	float4 P_curve[2];
-
-	if(type & PRIMITIVE_CURVE) {
-		P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
-		P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
-	}
-	else {
-		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
-		motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve);
-	}
-
-	float or1 = P_curve[0].w;
-	float or2 = P_curve[1].w;
-	float3 p1 = float4_to_float3(P_curve[0]);
-	float3 p2 = float4_to_float3(P_curve[1]);
-
-	/* minimum width extension */
-	float r1 = or1;
-	float r2 = or2;
-	float3 dif = P - p1;
-	float3 dif_second = P - p2;
-	if(difl != 0.0f) {
-		float pixelsize = min(len3(dif) * difl, extmax);
-		r1 = or1 < pixelsize ? pixelsize : or1;
-		pixelsize = min(len3(dif_second) * difl, extmax);
-		r2 = or2 < pixelsize ? pixelsize : or2;
-	}
-	/* --- */
-
-	float3 p21_diff = p2 - p1;
-	float3 sphere_dif1 = (dif + dif_second) * 0.5f;
-	float3 dir = direction;
-	float sphere_b_tmp = dot3(dir, sphere_dif1);
-	float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
-#else
-	ssef P_curve[2];
-	
-	if(type & PRIMITIVE_CURVE) {
-		P_curve[0] = load4f(&kg->__curve_keys.data[k0].x);
-		P_curve[1] = load4f(&kg->__curve_keys.data[k1].x);
-	}
-	else {
-		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
-		motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve);
-	}
-
-	const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]);
-
-	ssef r12 = or12;
-	const ssef vP = load4f(P);
-	const ssef dif = vP - P_curve[0];
-	const ssef dif_second = vP - P_curve[1];
-	if(difl != 0.0f) {
-		const ssef len1_sq = len3_squared_splat(dif);
-		const ssef len2_sq = len3_squared_splat(dif_second);
-		const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq));
-		const ssef pixelsize12 = min(len12 * difl, ssef(extmax));
-		r12 = max(or12, pixelsize12);
-	}
-	float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12));
-	float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12));
-
-	const ssef p21_diff = P_curve[1] - P_curve[0];
-	const ssef sphere_dif1 = (dif + dif_second) * 0.5f;
-	const ssef dir = load4f(direction);
-	const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1);
-	const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1);
-#endif
-
-	float mr = max(r1, r2);
-	float l = len3(p21_diff);
-	float invl = 1.0f / l;
-	float sp_r = mr + 0.5f * l;
-
-	float sphere_b = dot3(dir, sphere_dif2);
-	float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r;
-
-	if(sdisc < 0.0f)
-		return false;
-
-	/* obtain parameters and test midpoint distance for suitable modes */
-#ifndef __KERNEL_SSE2__
-	float3 tg = p21_diff * invl;
-#else
-	const ssef tg = p21_diff * invl;
-#endif
-	float gd = (r2 - r1) * invl;
-
-	float dirz = dot3(dir, tg);
-	float difz = dot3(dif, tg);
-
-	float a = 1.0f - (dirz*dirz*(1 + gd*gd));
-
-	float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1));
-
-	float tcentre = -halfb/a;
-	float zcentre = difz + (dirz * tcentre);
-
-	if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
-		return false;
-	if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION))
-		return false;
-
-	/* test minimum separation */
-#ifndef __KERNEL_SSE2__
-	float3 cprod = cross(tg, dir);
-	float cprod2sq = len3_squared(cross(tg, dif));
-#else
-	const ssef cprod = cross(tg, dir);
-	float cprod2sq = len3_squared(cross_zxy(tg, dif));
-#endif
-	float cprodsq = len3_squared(cprod);
-	float distscaled = dot3(cprod, dif);
-
-	if(cprodsq == 0)
-		distscaled = cprod2sq;
-	else
-		distscaled = (distscaled*distscaled)/cprodsq;
-
-	if(distscaled > mr*mr)
-		return false;
-
-	/* calculate true intersection */
-#ifndef __KERNEL_SSE2__
-	float3 tdif = dif + tcentre * dir;
-#else
-	const ssef tdif = madd(ssef(tcentre), dir, dif);
-#endif
-	float tdifz = dot3(tdif, tg);
-	float tdifma = tdifz*gd + r1;
-	float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma));
-	float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma;
-	float td = tb*tb - 4*a*tc;
-
-	if(td < 0.0f)
-		return false;
-
-	float rootd = 0.0f;
-	float correction = 0.0f;
-	if(flags & CURVE_KN_ACCURATE) {
-		rootd = sqrtf(td);
-		correction = ((-tb - rootd)/(2*a));
-	}
-
-	float t = tcentre + correction;
-
-	if(t < isect->t) {
-
-		if(flags & CURVE_KN_INTERSECTCORRECTION) {
-			rootd = sqrtf(td);
-			correction = ((-tb - rootd)/(2*a));
-			t = tcentre + correction;
-		}
-
-		float z = zcentre + (dirz * correction);
-		// bool backface = false;
-
-		if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) {
-			// backface = true;
-			correction = ((-tb + rootd)/(2*a));
-			t = tcentre + correction;
-			z = zcentre + (dirz * correction);
-		}
-
-		/* stochastic fade from minimum width */
-		float adjradius = or1 + z * (or2 - or1) * invl;
-		adjradius = adjradius / (r1 + z * gd);
-		if(lcg_state && adjradius != 1.0f) {
-			if(lcg_step_float(lcg_state) > adjradius)
-				return false;
-		}
-		/* --- */
-
-		if(t > 0.0f && t < isect->t && z >= 0 && z <= l) {
-
-			if(flags & CURVE_KN_ENCLOSEFILTER) {
-				float enc_ratio = 1.01f;
-				if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) {
-					float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
-					float c2 = dot3(dif, dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
-					if(a2*c2 < 0.0f)
-						return false;
-				}
-			}
-
-#ifdef __VISIBILITY_FLAG__
-			/* visibility flag test. we do it here under the assumption
-			 * that most triangles are culled by node flags */
-			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
-#endif
-			{
-				/* record intersection */
-				isect->t = t;
-				isect->u = z*invl;
-				isect->v = gd;
-				isect->prim = curveAddr;
-				isect->object = object;
-				isect->type = type;
-
-				return true;
-			}
-		}
-	}
-
-	return false;
-
-#ifndef __KERNEL_SSE2__
-#  undef len3_squared
-#  undef len3
-#  undef dot3
-#  endif
-}
-
-ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)
-{
-	float fc = 0.71f;
-	float data[4];
-	float t2 = t * t;
-	data[0] = -3.0f * fc          * t2  + 4.0f * fc * t                  - fc;
-	data[1] =  3.0f * (2.0f - fc) * t2  + 2.0f * (fc - 3.0f) * t;
-	data[2] =  3.0f * (fc - 2.0f) * t2  + 2.0f * (3.0f - 2.0f * fc) * t  + fc;
-	data[3] =  3.0f * fc          * t2  - 2.0f * fc * t;
-	return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
-}
-
-ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3)
-{
-	float data[4];
-	float fc = 0.71f;
-	float t2 = t * t;
-	float t3 = t2 * t;
-	data[0] = -fc          * t3  + 2.0f * fc          * t2 - fc * t;
-	data[1] =  (2.0f - fc) * t3  + (fc - 3.0f)        * t2 + 1.0f;
-	data[2] =  (fc - 2.0f) * t3  + (3.0f - 2.0f * fc) * t2 + fc * t;
-	data[3] =  fc          * t3  - fc * t2;
-	return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
-}
-
-ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
-{
-	int flag = kernel_data.curve.curveflags;
-	float t = isect->t;
-	float3 P = ray->P;
-	float3 D = ray->D;
-
-	if(isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D*t);
-		D = normalize_len(D, &t);
-	}
-
-	int prim = kernel_tex_fetch(__prim_index, isect->prim);
-	float4 v00 = kernel_tex_fetch(__curves, prim);
-
-	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
-	int k1 = k0 + 1;
-
-	float3 tg;
-
-	if(flag & CURVE_KN_INTERPOLATE) {
-		int ka = max(k0 - 1,__float_as_int(v00.x));
-		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
-
-		float4 P_curve[4];
-
-		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
-			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
-			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
-			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
-			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
-		}
-		else {
-			motion_cardinal_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), ka, k0, k1, kb, P_curve);
-		}
-
-		float3 p[4];
-		p[0] = float4_to_float3(P_curve[0]);
-		p[1] = float4_to_float3(P_curve[1]);
-		p[2] = float4_to_float3(P_curve[2]);
-		p[3] = float4_to_float3(P_curve[3]);
-
-		P = P + D*t;
-
-#ifdef __UV__
-		ccl_fetch(sd, u) = isect->u;
-		ccl_fetch(sd, v) = 0.0f;
-#endif
-
-		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
-
-		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
-			ccl_fetch(sd, Ng) = normalize(-(D - tg * (dot(tg, D))));
-		}
-		else {
-			/* direction from inside to surface of curve */
-			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);	
-			ccl_fetch(sd, Ng) = normalize(P - p_curr);
-
-			/* adjustment for changing radius */
-			float gd = isect->v;
-
-			if(gd != 0.0f) {
-				ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg;
-				ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
-			}
-		}
-
-		/* todo: sometimes the normal is still so that this is detected as
-		 * backfacing even if cull backfaces is enabled */
-
-		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
-	}
-	else {
-		float4 P_curve[2];
-
-		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
-			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
-			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
-		}
-		else {
-			motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve);
-		}
-
-		float l = 1.0f;
-		tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l);
-		
-		P = P + D*t;
-
-		float3 dif = P - float4_to_float3(P_curve[0]);
-
-#ifdef __UV__
-		ccl_fetch(sd, u) = dot(dif,tg)/l;
-		ccl_fetch(sd, v) = 0.0f;
-#endif
-
-		if(flag & CURVE_KN_TRUETANGENTGNORMAL) {
-			ccl_fetch(sd, Ng) = -(D - tg * dot(tg, D));
-			ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
-		}
-		else {
-			float gd = isect->v;
-
-			/* direction from inside to surface of curve */
-			ccl_fetch(sd, Ng) = (dif - tg * ccl_fetch(sd, u) * l) / (P_curve[0].w + ccl_fetch(sd, u) * l * gd);
-
-			/* adjustment for changing radius */
-			if(gd != 0.0f) {
-				ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg;
-				ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
-			}
-		}
-
-		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
-	}
-
-#ifdef __DPDU__
-	/* dPdu/dPdv */
-	ccl_fetch(sd, dPdu) = tg;
-	ccl_fetch(sd, dPdv) = cross(tg, ccl_fetch(sd, Ng));
-#endif
-
-	if(isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-}
-
-#endif
+#endif  /* __HAIR__ */
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h
new file mode 100644
index 00000000000..e9a149ea1ab
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_curve_intersect.h
@@ -0,0 +1,934 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Curve primitive intersection functions. */
+
+#ifdef __HAIR__
+
+#if defined(__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300)
+#  define ccl_device_curveintersect ccl_device
+#else
+#  define ccl_device_curveintersect ccl_device_forceinline
+#endif
+
+#ifdef __KERNEL_SSE2__  /* SSE-only helper; cardinal_curve_intersect() uses it in its non-AVX2 SSE path to move curve keys into ray space. */
+ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)
+{
+	return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2]));  /* presumably a.x*t[0] + a.y*t[1] + a.z*t[2] (shuffle<i> splatting lane i, madd(x, y, z) = x*y + z) - TODO confirm; no translation term is added */
+}
+#endif
+
+/* Intersect a ray with one cubic curve segment by iterative binary subdivision in ray space; isect is updated on a hit. On CPU pass P and dir by reference to aligned vector. */
+ccl_device_curveintersect bool cardinal_curve_intersect(
+        KernelGlobals *kg,  /* kernel data: source of the __curves, __curve_keys and __prim_* lookups */
+        Intersection *isect,  /* in/out: isect->t bounds the search; t, u, v, prim, object and type are written on a hit */
+        const float3 ccl_ref P,  /* point subtracted from the curve keys before the ray-space transform */
+        const float3 ccl_ref dir,  /* direction the ray-space transform is built from (it forms the third row) */
+        uint visibility,  /* mask tested against __prim_visibility when __VISIBILITY_FLAG__ is defined */
+        int object,  /* stored in isect->object; OBJECT_NONE makes the motion path look the object up in __prim_object */
+        int curveAddr,  /* index into the __prim_* arrays; stored in isect->prim */
+        float time,  /* only used when type lacks PRIMITIVE_CURVE (prim time range test, motion keys) */
+        int type,  /* primitive type bits; the segment index is unpacked from it; stored in isect->type */
+        uint *lcg_state,  /* optional random state for the stochastic minimum-width fade; NULL skips the fade */
+        float difl,  /* minimum-width scale factor; 0 skips the coverage and fade code */
+        float extmax)  /* upper clamp applied to the minimum-width extension */
+{
+	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);  /* false selects the motion_cardinal_curve_keys*() path for the keys */
+
+	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
+		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
+		if(time < prim_time.x || time > prim_time.y) {  /* time lies outside the range stored for this primitive */
+			return false;
+		}
+	}
+
+	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
+	float epsilon = 0.0f;  /* hits with depth <= epsilon are rejected; raised to 2 * r_curr further down for some flag combinations */
+	float r_st, r_en;  /* .w of the keys at the start (k0) and end (k1) of the segment, used as radii */
+
+	int depth = kernel_data.curve.subdivisions;  /* the segment is split into 1 << depth leaf intervals */
+	int flags = kernel_data.curve.curveflags;
+	int prim = kernel_tex_fetch(__prim_index, curveAddr);
+
+#ifdef __KERNEL_SSE2__
+	ssef vdir = load4f(dir);
+	ssef vcurve_coef[4];
+	const float3 *curve_coef = (float3 *)vcurve_coef;  /* scalar view of the same storage; assumes float3 has the size of ssef in SSE builds - TODO confirm */
+
+	{
+		ssef dtmp = vdir * vdir;
+		ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp));  /* presumably lane 0 = sqrt(dir.x^2 + dir.z^2), like `d` in the scalar path - TODO confirm */
+		ssef rd_ss = load1f_first(1.0f) / d_ss;
+
+		ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]);
+		int2 &v00 = (int2 &)v00vec;  /* v00.x: index of the curve's first key; v00.x + v00.y - 1 is used as its last key */
+
+		int k0 = v00.x + segment;  /* key at the start of the segment */
+		int k1 = k0 + 1;  /* key at the end of the segment */
+		int ka = max(k0 - 1, v00.x);  /* previous key, clamped to the first key of the curve */
+		int kb = min(k1 + 1, v00.x + v00.y - 1);  /* next key, clamped to the last key of the curve */
+
+#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
+		avxf P_curve_0_1, P_curve_2_3;
+		if(is_curve_primitive) {
+			P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x);  /* first argument fills the high half: low = key ka, high = key k0 */
+			P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x);  /* low = key k1, high = key kb */
+		}
+		else {
+			int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
+			motion_cardinal_curve_keys_avx(kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1,&P_curve_2_3);
+		}
+#else  /* __KERNEL_AVX2__ */
+		ssef P_curve[4];
+
+		if(is_curve_primitive) {
+			P_curve[0] = load4f(&kg->__curve_keys.data[ka].x);
+			P_curve[1] = load4f(&kg->__curve_keys.data[k0].x);
+			P_curve[2] = load4f(&kg->__curve_keys.data[k1].x);
+			P_curve[3] = load4f(&kg->__curve_keys.data[kb].x);
+		}
+		else {
+			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve);
+		}
+#endif  /* __KERNEL_AVX2__ */
+
+		ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss));  /* NOTE(review): htfm0..2 are presumably the SSE form of the make_transform() call in the scalar path below - confirm */
+		ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn;
+		ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy;
+		ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
+		ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));  /* vdir with one lane masked to zero (presumably the fourth) */
+
+		ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
+		ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0);
+		ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
+
+#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
+		const avxf vPP = _mm256_broadcast_ps(&P.m128);  /* P copied into both 128-bit halves */
+		const avxf htfm00 = avxf(htfm0.m128, htfm0.m128);
+		const avxf htfm11 = avxf(htfm1.m128, htfm1.m128);
+		const avxf htfm22 = avxf(htfm2.m128, htfm2.m128);
+
+		const avxf p01 = madd(shuffle<0>(P_curve_0_1 - vPP),  /* same expression shape as transform_point_T3(), applied to two keys per register */
+		                      htfm00,
+		                      madd(shuffle<1>(P_curve_0_1 - vPP),
+		                           htfm11,
+		                           shuffle<2>(P_curve_0_1 - vPP) * htfm22));
+		const avxf p23 = madd(shuffle<0>(P_curve_2_3 - vPP),
+		                      htfm00,
+		                      madd(shuffle<1>(P_curve_2_3 - vPP),
+		                           htfm11,
+		                           shuffle<2>(P_curve_2_3 - vPP)*htfm22));
+
+		const ssef p0 = _mm256_castps256_ps128(p01);  /* low half: transformed key ka */
+		const ssef p1 = _mm256_extractf128_ps(p01, 1);  /* high half: transformed key k0 */
+		const ssef p2 = _mm256_castps256_ps128(p23);  /* low half: transformed key k1 */
+		const ssef p3 = _mm256_extractf128_ps(p23, 1);  /* high half: transformed key kb */
+
+		const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1);
+		r_st = ((float4 &)P_curve_1).w;  /* .w of the untransformed key k0 */
+		const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3);
+		r_en = ((float4 &)P_curve_2).w;  /* .w of the untransformed key k1 */
+#else  /* __KERNEL_AVX2__ */
+		ssef htfm[] = { htfm0, htfm1, htfm2 };
+		ssef vP = load4f(P);
+		ssef p0 = transform_point_T3(htfm, P_curve[0] - vP);
+		ssef p1 = transform_point_T3(htfm, P_curve[1] - vP);
+		ssef p2 = transform_point_T3(htfm, P_curve[2] - vP);
+		ssef p3 = transform_point_T3(htfm, P_curve[3] - vP);
+
+		r_st = ((float4 &)P_curve[1]).w;
+		r_en = ((float4 &)P_curve[2]).w;
+#endif  /* __KERNEL_AVX2__ */
+
+		float fc = 0.71f;  /* scales the (p2 - p0) term of curve_coef[1]; same value as in the scalar path */
+		ssef vfc = ssef(fc);
+		ssef vfcxp3 = vfc * p3;
+
+		vcurve_coef[0] = p1;  /* coefficients of p(u) = ((c3*u + c2)*u + c1)*u + c0, as evaluated in the loop below */
+		vcurve_coef[1] = vfc * (p2 - p0);
+		vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3)));  /* matches the scalar curve_coef[2] if msub(x, y, z) = x*y - z - TODO confirm */
+		vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3));
+
+	}
+#else
+	float3 curve_coef[4];
+
+	/* curve intersection check */
+	/* obtain curve parameters */
+	{
+		/* Build the ray-space transform here; ideally it would be created once at the beginning of the intersection loop. */
+		Transform htfm;
+		float d = sqrtf(dir.x * dir.x + dir.z * dir.z);  /* NOTE(review): d is 0 when dir.x and dir.z are both 0, and is divided by below - confirm callers avoid this */
+		htfm = make_transform(
+			dir.z / d, 0, -dir.x /d, 0,
+			-dir.x * dir.y /d, d, -dir.y * dir.z /d, 0,
+			dir.x, dir.y, dir.z, 0,  /* third row is dir, so a transformed point's z is its projection onto dir */
+			0, 0, 0, 1);
+
+		float4 v00 = kernel_tex_fetch(__curves, prim);  /* v00.x / v00.y hold integers stored as float bits: first key index and the count used for the last key */
+
+		int k0 = __float_as_int(v00.x) + segment;  /* key at the start of the segment */
+		int k1 = k0 + 1;  /* key at the end of the segment */
+
+		int ka = max(k0 - 1,__float_as_int(v00.x));  /* previous key, clamped to the first key of the curve */
+		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);  /* next key, clamped to the last key of the curve */
+
+		float4 P_curve[4];
+
+		if(is_curve_primitive) {
+			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
+			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
+			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
+			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
+		}
+		else {
+			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve);
+		}
+
+		float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P);  /* keys are made relative to P, then moved into ray space */
+		float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P);
+		float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P);
+		float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P);
+
+		float fc = 0.71f;  /* scales the (p2 - p0) term of curve_coef[1] */
+		curve_coef[0] = p1;  /* coefficients of p(u) = ((c3*u + c2)*u + c1)*u + c0; p(0) = p1 and the four sum to p2 at u = 1 */
+		curve_coef[1] = -fc*p0 + fc*p2;
+		curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3;
+		curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3;
+		r_st = P_curve[1].w;
+		r_en = P_curve[2].w;
+	}
+#endif
+
+	float r_curr = max(r_st, r_en);  /* larger of the two end radii, used for the whole-segment rejection tests */
+
+	if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING))
+		epsilon = 2 * r_curr;
+
+	/* find bounds - this is slow for cubic curves */
+	float upper, lower;
+
+	float zextrem[4];  /* filled by curvebounds(); used below as (parameter, value) pairs: [0]/[1] and [2]/[3] */
+	curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z);
+	if(lower - r_curr > isect->t || upper + r_curr < epsilon)  /* whole segment is beyond the current hit distance or in front of epsilon */
+		return false;
+
+	/* minimum width extension */
+	float mw_extension = min(difl * fabsf(upper), extmax);
+	float r_ext = mw_extension + r_curr;
+
+	float xextrem[4];
+	curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x);
+	if(lower > r_ext || upper < -r_ext)  /* x range of the curve does not reach within r_ext of the ray axis */
+		return false;
+
+	float yextrem[4];
+	curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y);
+	if(lower > r_ext || upper < -r_ext)  /* same test for the y range */
+		return false;
+
+	/* set up the iterative subdivision loop */
+	int level = 1 << depth;  /* size of the current interval, in leaf units */
+	int tree = 0;  /* index of the first leaf covered by the current interval */
+	float resol = 1.0f / (float)level;  /* width of one leaf interval in curve parameter space */
+	bool hit = false;
+
+	/* begin loop */
+	while(!(tree >> (depth))) {  /* runs until tree reaches 1 << depth, i.e. every leaf has been visited or skipped */
+		const float i_st = tree * resol;  /* curve parameter at the start of the current interval */
+		const float i_en = i_st + (level * resol);  /* curve parameter at the end of the current interval */
+
+#ifdef __KERNEL_SSE2__
+		ssef vi_st = ssef(i_st), vi_en = ssef(i_en);
+		ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
+		ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]);
+
+		ssef vbmin = min(vp_st, vp_en);
+		ssef vbmax = max(vp_st, vp_en);
+
+		float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax;  /* references into the SSE registers, so the scalar updates below write through to vbmin / vbmax */
+		float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z;
+		float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z;
+		float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en;
+#else
+		float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0];  /* curve point at i_st */
+		float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0];  /* curve point at i_en */
+
+		float bminx = min(p_st.x, p_en.x);
+		float bmaxx = max(p_st.x, p_en.x);
+		float bminy = min(p_st.y, p_en.y);
+		float bmaxy = max(p_st.y, p_en.y);
+		float bminz = min(p_st.z, p_en.z);
+		float bmaxz = max(p_st.z, p_en.z);
+#endif
+
+		if(xextrem[0] >= i_st && xextrem[0] <= i_en) {  /* grow the box by any per-axis extremum whose parameter falls inside the interval */
+			bminx = min(bminx,xextrem[1]);
+			bmaxx = max(bmaxx,xextrem[1]);
+		}
+		if(xextrem[2] >= i_st && xextrem[2] <= i_en) {
+			bminx = min(bminx,xextrem[3]);
+			bmaxx = max(bmaxx,xextrem[3]);
+		}
+		if(yextrem[0] >= i_st && yextrem[0] <= i_en) {
+			bminy = min(bminy,yextrem[1]);
+			bmaxy = max(bmaxy,yextrem[1]);
+		}
+		if(yextrem[2] >= i_st && yextrem[2] <= i_en) {
+			bminy = min(bminy,yextrem[3]);
+			bmaxy = max(bmaxy,yextrem[3]);
+		}
+		if(zextrem[0] >= i_st && zextrem[0] <= i_en) {
+			bminz = min(bminz,zextrem[1]);
+			bmaxz = max(bmaxz,zextrem[1]);
+		}
+		if(zextrem[2] >= i_st && zextrem[2] <= i_en) {
+			bminz = min(bminz,zextrem[3]);
+			bmaxz = max(bmaxz,zextrem[3]);
+		}
+
+		float r1 = r_st + (r_en - r_st) * i_st;  /* radius interpolated linearly in the curve parameter */
+		float r2 = r_st + (r_en - r_st) * i_en;
+		r_curr = max(r1, r2);
+
+		mw_extension = min(difl * fabsf(bmaxz), extmax);
+		float r_ext = mw_extension + r_curr;  /* shadows the function-level r_ext for the rest of this iteration */
+		float coverage = 1.0f;
+
+		if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
+			/* the bounding box does not overlap the square centered at O */
+			tree += level;  /* skip the whole interval */
+			level = tree & -tree;  /* lowest set bit of tree: the largest aligned interval starting at the new position */
+		}
+		else if(level == 1) {
+
+			/* the maximum recursion depth is reached.
+			 * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0.
+			 * dP* is reversed if necessary.*/
+			float t = isect->t;
+			float u = 0.0f;
+			float gd = 0.0f;  /* stays 0 in the ribbon branch; set from the radius difference in the other branch */
+
+			if(flags & CURVE_KN_RIBBONS) {
+				float3 tg = (p_en - p_st);  /* chord of the leaf interval, not normalized */
+#ifdef __KERNEL_SSE__  /* NOTE(review): the rest of this function selects its SIMD path with __KERNEL_SSE2__ - confirm this guard is intended */
+				const float3 tg_sq = tg * tg;
+				float w = tg_sq.x + tg_sq.y;
+#else
+				float w = tg.x * tg.x + tg.y * tg.y;
+#endif
+				if(w == 0) {  /* chord has no extent in x/y: skip this leaf */
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+#ifdef __KERNEL_SSE__
+				const float3 p_sttg = p_st * tg;
+				w = -(p_sttg.x + p_sttg.y) / w;
+#else
+				w = -(p_st.x * tg.x + p_st.y * tg.y) / w;  /* parameter along the chord of the point closest to the origin in x/y */
+#endif
+				w = saturate(w);
+
+				/* compute u on the curve segment */
+				u = i_st * (1 - w) + i_en * w;
+				r_curr = r_st + (r_en - r_st) * u;
+				/* compare x-y distances */
+				float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0];
+
+				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];  /* derivative of p(u) at i_st */
+				if(dot(tg, dp_st)< 0)
+					dp_st *= -1;
+				if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];  /* derivative of p(u) at i_en */
+				if(dot(tg, dp_en) < 0)
+					dp_en *= -1;
+				if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				/* compute coverage */
+				float r_ext = r_curr;  /* shadows the loop-level r_ext inside the ribbon branch */
+				coverage = 1.0f;
+				if(difl != 0.0f) {
+					mw_extension = min(difl * fabsf(bmaxz), extmax);
+					r_ext = mw_extension + r_curr;
+#ifdef __KERNEL_SSE__  /* NOTE(review): _mm_hadd_ps is an SSE3 intrinsic - confirm it is available wherever __KERNEL_SSE__ is defined */
+					const float3 p_curr_sq = p_curr * p_curr;
+					const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128)));  /* lane 0 = sqrt(x^2 + y^2) */
+					float d = dxxx.x;
+#else
+					float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y);  /* x/y distance of p_curr from the ray axis */
+#endif
+					float d0 = d - r_curr;
+					float d1 = d + r_curr;
+					float inv_mw_extension = 1.0f/mw_extension;
+					if(d0 >= 0)
+						coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f;
+					else // inside
+						coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f;
+				}
+
+				if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {  /* too far from the axis, too close, or behind the current hit */
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				t = p_curr.z;
+
+				/* stochastic fade from minimum width */
+				if(difl != 0.0f && lcg_state) {
+					if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
+						return hit;  /* leaves the function: remaining intervals are not examined */
+				}
+			}
+			else {
+				float l = len(p_en - p_st);  /* chord length of the leaf interval */
+				/* minimum width extension */
+				float or1 = r1;
+				float or2 = r2;
+
+				if(difl != 0.0f) {
+					mw_extension = min(len(p_st - P) * difl, extmax);  /* NOTE(review): p_st was already made relative to P before the ray-space transform, yet P is subtracted again - confirm intended */
+					or1 = r1 < mw_extension ? mw_extension : r1;
+					mw_extension = min(len(p_en - P) * difl, extmax);
+					or2 = r2 < mw_extension ? mw_extension : r2;
+				}
+				/* --- */
+				float invl = 1.0f/l;
+				float3 tg = (p_en - p_st) * invl;  /* unit chord direction */
+				gd = (or2 - or1) * invl;  /* change of the (extended) radius per unit chord length */
+				float difz = -dot(p_st,tg);
+				float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd));  /* leading coefficient of the quadratic solved below */
+				float invcyla = 1.0f/cyla;
+				float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1)));
+				float tcentre = -halfb*invcyla;
+				float zcentre = difz + (tg.z * tcentre);
+				float3 tdif = - p_st;
+				tdif.z += tcentre;
+				float tdifz = dot(tdif,tg);
+				float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1)));
+				float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd;
+				float td = tb*tb - 4*cyla*tc;  /* discriminant of the quadratic */
+				if(td < 0.0f) {  /* no real root: skip this leaf */
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				float rootd = sqrtf(td);
+				float correction = (-tb - rootd) * 0.5f * invcyla;  /* first root, using -rootd */
+				t = tcentre + correction;
+
+				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];  /* derivative of p(u) at i_st */
+				if(dot(tg, dp_st)< 0)
+					dp_st *= -1;
+				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];  /* derivative of p(u) at i_en */
+				if(dot(tg, dp_en) < 0)
+					dp_en *= -1;
+
+				if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) {  /* first root rejected: retry with the +rootd root */
+					correction = (-tb + rootd) * 0.5f * invcyla;
+					t = tcentre + correction;
+				}
+
+				if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {  /* end-plane tests failed, or t is not in (0, isect->t] */
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				float w = (zcentre + (tg.z * correction)) * invl;  /* position along the chord as a fraction of its length */
+				w = saturate(w);
+				/* compute u on the curve segment */
+				u = i_st * (1 - w) + i_en * w;
+
+				/* stochastic fade from minimum width */
+				if(difl != 0.0f && lcg_state) {
+					r_curr = r1 + (r2 - r1) * w;
+					r_ext = or1 + (or2 - or1) * w;
+					coverage = r_curr/r_ext;  /* ratio of the interpolated radius to the extended radius */
+
+					if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
+						return hit;  /* leaves the function: remaining intervals are not examined */
+				}
+			}
+			/* we found a new intersection */
+
+#ifdef __VISIBILITY_FLAG__
+			/* visibility flag test. we do it here under the assumption
+			 * that most triangles are culled by node flags */
+			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
+#endif
+			{
+				/* record intersection */
+				isect->t = t;  /* also tightens the distance bound used by the remaining intervals */
+				isect->u = u;
+				isect->v = gd;
+				isect->prim = curveAddr;
+				isect->object = object;
+				isect->type = type;
+				hit = true;
+			}
+
+			tree++;  /* advance to the next leaf */
+			level = tree & -tree;
+		}
+		else {
+			/* split the curve into two curves and process */
+			level = level >> 1;  /* halve the interval; tree is unchanged, so the first half is examined next */
+		}
+	}
+
+	return hit;  /* true if any leaf interval recorded an intersection into isect */
+}
+
+ccl_device_curveintersect bool curve_intersect(KernelGlobals *kg,
+                                               Intersection *isect,  /* isect->t limits the search; t/u/v/prim/object/type are written on a hit */
+                                               float3 P,  /* ray start point */
+                                               float3 direction,  /* ray direction; the math looks like it assumes unit length - TODO confirm */
+                                               uint visibility,  /* mask tested against __prim_visibility */
+                                               int object,  /* OBJECT_NONE: motion keys use __prim_object instead */
+                                               int curveAddr,  /* index into the __prim_* arrays */
+                                               float time,  /* ray time, used for motion steps/keys */
+                                               int type,  /* primitive type bits plus packed segment index */
+                                               uint *lcg_state,  /* random state for the stochastic fade; may be NULL */
+                                               float difl,  /* minimum-width scale per unit distance; 0 disables it */
+                                               float extmax)  /* cap on the minimum-width radius */
+{
+	/* Scalar build: map the *3 helper names used below onto the plain float3 functions, to minimize code duplication with the SSE path. */
+#ifndef __KERNEL_SSE2__
+#  define len3_squared(x) len_squared(x)
+#  define len3(x) len(x)
+#  define dot3(x, y) dot(x, y)
+#endif
+	/* Test the ray against one straight curve segment whose radius varies linearly between its two keys; returns true only when a closer, visible hit was recorded in isect. */
+	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);  /* false: keys come from motion_curve_keys() */
+
+	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
+		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
+		if(time < prim_time.x || time > prim_time.y) {  /* ray time outside [prim_time.x, prim_time.y] */
+			return false;
+		}
+	}
+
+	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
+	/* curve Intersection check */
+	int flags = kernel_data.curve.curveflags;
+
+	int prim = kernel_tex_fetch(__prim_index, curveAddr);
+	float4 v00 = kernel_tex_fetch(__curves, prim);
+
+	int cnum = __float_as_int(v00.x);  /* v00.x carries an int bit pattern: base key index, segment is added to it */
+	int k0 = cnum + segment;
+	int k1 = k0 + 1;
+
+#ifndef __KERNEL_SSE2__
+	float4 P_curve[2];
+
+	if(is_curve_primitive) {
+		P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
+		P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
+	}
+	else {
+		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+		motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve);
+	}
+
+	float or1 = P_curve[0].w;  /* original radius at key 0 (taken from .w) */
+	float or2 = P_curve[1].w;  /* original radius at key 1 */
+	float3 p1 = float4_to_float3(P_curve[0]);
+	float3 p2 = float4_to_float3(P_curve[1]);
+
+	/* minimum width extension: raise each end radius to at least min(distance to P * difl, extmax) */
+	float r1 = or1;
+	float r2 = or2;
+	float3 dif = P - p1;
+	float3 dif_second = P - p2;
+	if(difl != 0.0f) {
+		float pixelsize = min(len3(dif) * difl, extmax);
+		r1 = or1 < pixelsize ? pixelsize : or1;
+		pixelsize = min(len3(dif_second) * difl, extmax);
+		r2 = or2 < pixelsize ? pixelsize : or2;
+	}
+	/* --- */
+
+	float3 p21_diff = p2 - p1;
+	float3 sphere_dif1 = (dif + dif_second) * 0.5f;  /* P relative to the segment midpoint */
+	float3 dir = direction;
+	float sphere_b_tmp = dot3(dir, sphere_dif1);
+	float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;  /* sphere_dif1 with its component along dir removed (for unit dir) */
+#else
+	ssef P_curve[2];
+
+	if(is_curve_primitive) {
+		P_curve[0] = load4f(&kg->__curve_keys.data[k0].x);
+		P_curve[1] = load4f(&kg->__curve_keys.data[k1].x);
+	}
+	else {
+		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+		motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve);
+	}
+
+	const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]);  /* both key radii; lane 0 and lane 2 are extracted as or1/or2 below */
+
+	ssef r12 = or12;
+	const ssef vP = load4f(P);
+	const ssef dif = vP - P_curve[0];
+	const ssef dif_second = vP - P_curve[1];
+	if(difl != 0.0f) {
+		const ssef len1_sq = len3_squared_splat(dif);
+		const ssef len2_sq = len3_squared_splat(dif_second);
+		const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq));
+		const ssef pixelsize12 = min(len12 * difl, ssef(extmax));
+		r12 = max(or12, pixelsize12);  /* same minimum-width extension as the scalar path, both ends at once */
+	}
+	float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12));
+	float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12));
+
+	const ssef p21_diff = P_curve[1] - P_curve[0];
+	const ssef sphere_dif1 = (dif + dif_second) * 0.5f;
+	const ssef dir = load4f(direction);
+	const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1);
+	const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1);
+#endif
+
+	float mr = max(r1, r2);
+	float l = len3(p21_diff);
+	float invl = 1.0f / l;
+	float sp_r = mr + 0.5f * l;  /* radius of a sphere around the segment midpoint that encloses the segment */
+
+	float sphere_b = dot3(dir, sphere_dif2);
+	float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r;
+
+	if(sdisc < 0.0f)  /* the ray line misses the enclosing sphere */
+		return false;
+
+	/* obtain parameters and test midpoint distance for suitable modes */
+#ifndef __KERNEL_SSE2__
+	float3 tg = p21_diff * invl;
+#else
+	const ssef tg = p21_diff * invl;
+#endif
+	float gd = (r2 - r1) * invl;  /* radius change per unit length along the segment */
+
+	float dirz = dot3(dir, tg);
+	float difz = dot3(dif, tg);
+
+	float a = 1.0f - (dirz*dirz*(1 + gd*gd));  /* NOTE(review): nothing guards a == 0 before the divisions by a below */
+
+	float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1));
+
+	float tcentre = -halfb/a;
+	float zcentre = difz + (dirz * tcentre);  /* position along the segment axis at ray parameter tcentre */
+
+	if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
+		return false;
+	if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION))
+		return false;
+
+	/* test minimum separation: squared distance between the ray line and the segment axis line against the largest radius */
+#ifndef __KERNEL_SSE2__
+	float3 cprod = cross(tg, dir);
+	float cprod2sq = len3_squared(cross(tg, dif));
+#else
+	const ssef cprod = cross(tg, dir);
+	float cprod2sq = len3_squared(cross_zxy(tg, dif));
+#endif
+	float cprodsq = len3_squared(cprod);
+	float distscaled = dot3(cprod, dif);
+
+	if(cprodsq == 0)  /* cross(tg, dir) vanished: fall back to the squared length of cross(tg, dif) */
+		distscaled = cprod2sq;
+	else
+		distscaled = (distscaled*distscaled)/cprodsq;
+
+	if(distscaled > mr*mr)
+		return false;
+
+	/* calculate true intersection */
+#ifndef __KERNEL_SSE2__
+	float3 tdif = dif + tcentre * dir;
+#else
+	const ssef tdif = madd(ssef(tcentre), dir, dif);
+#endif
+	float tdifz = dot3(tdif, tg);
+	float tdifma = tdifz*gd + r1;
+	float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma));
+	float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma;
+	float td = tb*tb - 4*a*tc;  /* discriminant of a*x^2 + tb*x + tc, solved relative to tcentre */
+
+	if(td < 0.0f)
+		return false;
+
+	float rootd = 0.0f;  /* NOTE(review): stays 0 unless ACCURATE or INTERSECTCORRECTION is set, yet the BACKFACING retry below reads it */
+	float correction = 0.0f;
+	if(flags & CURVE_KN_ACCURATE) {
+		rootd = sqrtf(td);
+		correction = ((-tb - rootd)/(2*a));
+	}
+
+	float t = tcentre + correction;  /* equals tcentre unless CURVE_KN_ACCURATE is set */
+
+	if(t < isect->t) {
+
+		if(flags & CURVE_KN_INTERSECTCORRECTION) {
+			rootd = sqrtf(td);
+			correction = ((-tb - rootd)/(2*a));
+			t = tcentre + correction;
+		}
+
+		float z = zcentre + (dirz * correction);
+		// bool backface = false;
+
+		if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) {  /* first root unusable: retry with the other root */
+			// backface = true;
+			correction = ((-tb + rootd)/(2*a));
+			t = tcentre + correction;
+			z = zcentre + (dirz * correction);
+		}
+
+		/* stochastic fade from minimum width: drop the hit when the random sample exceeds (original radius / extended radius) at z */
+		float adjradius = or1 + z * (or2 - or1) * invl;
+		adjradius = adjradius / (r1 + z * gd);
+		if(lcg_state && adjradius != 1.0f) {
+			if(lcg_step_float(lcg_state) > adjradius)
+				return false;
+		}
+		/* --- */
+
+		if(t > 0.0f && t < isect->t && z >= 0 && z <= l) {
+
+			if(flags & CURVE_KN_ENCLOSEFILTER) {  /* NOTE(review): appears to reject rays starting inside the segment enlarged by enc_ratio - confirm */
+				float enc_ratio = 1.01f;
+				if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) {
+					float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
+					float c2 = dot3(dif, dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
+					if(a2*c2 < 0.0f)
+						return false;
+				}
+			}
+
+#ifdef __VISIBILITY_FLAG__
+			/* visibility flag test. we do it here under the assumption
+			 * that most triangles are culled by node flags */
+			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
+#endif
+			{
+				/* record intersection */
+				isect->t = t;
+				isect->u = z*invl;  /* position along the segment divided by its length */
+				isect->v = gd;  /* radius gradient; curve_refine() reads it back from isect->v */
+				isect->prim = curveAddr;
+				isect->object = object;
+				isect->type = type;
+
+				return true;
+			}
+		}
+	}
+
+	return false;
+
+#ifndef __KERNEL_SSE2__
+#  undef len3_squared
+#  undef len3
+#  undef dot3
+#endif
+}
+
+ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)
+{
+	/* Weights are the derivatives with respect to t of the cubic weights in curvepoint(). */
+	const float tension = 0.71f;
+	const float tt = t * t;
+	const float w0 = -3.0f * tension          * tt  + 4.0f * tension * t                  - tension;
+	const float w1 =  3.0f * (2.0f - tension) * tt  + 2.0f * (tension - 3.0f) * t;
+	const float w2 =  3.0f * (tension - 2.0f) * tt  + 2.0f * (3.0f - 2.0f * tension) * t  + tension;
+	const float w3 =  3.0f * tension          * tt  - 2.0f * tension * t;
+	return w0 * p0 + w1 * p1 + w2 * p2 + w3 * p3;
+}
+
+ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3)
+{
+	/* Cubic blend of four control points: yields p1 at t = 0 and p2 at t = 1. */
+	const float tension = 0.71f;
+	const float tt = t * t;
+	const float ttt = tt * t;
+	const float w0 = -tension          * ttt  + 2.0f * tension          * tt - tension * t;
+	const float w1 =  (2.0f - tension) * ttt  + (tension - 3.0f)        * tt + 1.0f;
+	const float w2 =  (tension - 2.0f) * ttt  + (3.0f - 2.0f * tension) * tt + tension * t;
+	const float w3 =  tension          * ttt  - tension * tt;
+	return w0 * p0 + w1 * p1 + w2 * p2 + w3 * p3;
+}
+
+ccl_device_inline float3 curve_refine(KernelGlobals *kg,
+                                      ShaderData *sd,  /* reads type/object/prim/time (ob_itfm/ob_tfm with __OBJECT_MOTION__); receives Ng, N and, when enabled, u/v and dPdu/dPdv */
+                                      const Intersection *isect,  /* hit record: t, u, v, prim and object are read */
+                                      const Ray *ray)  /* ray->P and ray->D are read */
+{
+	int flag = kernel_data.curve.curveflags;
+	float t = isect->t;
+	float3 P = ray->P;
+	float3 D = ray->D;
+	/* Returns the hit position P + D*t and fills the normal fields of sd; when isect->object is set the math runs after applying the object's inverse transform. */
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_itfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D*t);
+		D = normalize_len(D, &t);  /* presumably leaves the length of the transformed D*t in t - verify normalize_len */
+	}
+
+	int prim = kernel_tex_fetch(__prim_index, isect->prim);
+	float4 v00 = kernel_tex_fetch(__curves, prim);
+
+	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+	int k1 = k0 + 1;
+
+	float3 tg;
+
+	if(flag & CURVE_KN_INTERPOLATE) {  /* four-key path using curvetangent() and curvepoint() */
+		int ka = max(k0 - 1,__float_as_int(v00.x));  /* previous key, clamped to the base key index */
+		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);  /* next key, clamped; v00.y presumably holds the key count */
+
+		float4 P_curve[4];
+
+		if(sd->type & PRIMITIVE_CURVE) {
+			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
+			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
+			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
+			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
+		}
+		else {
+			motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
+		}
+
+		float3 p[4];
+		p[0] = float4_to_float3(P_curve[0]);
+		p[1] = float4_to_float3(P_curve[1]);
+		p[2] = float4_to_float3(P_curve[2]);
+		p[3] = float4_to_float3(P_curve[3]);
+
+		P = P + D*t;  /* hit position */
+
+#ifdef __UV__
+		sd->u = isect->u;
+		sd->v = 0.0f;
+#endif
+
+		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
+
+		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
+			sd->Ng = normalize(-(D - tg * (dot(tg, D))));  /* -D with its component along tg removed */
+		}
+		else {
+			/* direction from inside to surface of curve */
+			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);
+			sd->Ng = normalize(P - p_curr);
+
+			/* adjustment for changing radius */
+			float gd = isect->v;
+
+			if(gd != 0.0f) {
+				sd->Ng = sd->Ng - gd * tg;
+				sd->Ng = normalize(sd->Ng);
+			}
+		}
+
+		/* todo: sometimes the normal is still so that this is detected as
+		 * backfacing even if cull backfaces is enabled */
+
+		sd->N = sd->Ng;
+	}
+	else {  /* straight segment between keys k0 and k1 */
+		float4 P_curve[2];
+
+		if(sd->type & PRIMITIVE_CURVE) {
+			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
+			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
+		}
+		else {
+			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
+		}
+
+		float l = 1.0f;
+		tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l);  /* l is used below as the segment length */
+
+		P = P + D*t;  /* hit position */
+
+		float3 dif = P - float4_to_float3(P_curve[0]);
+
+#ifdef __UV__
+		sd->u = dot(dif,tg)/l;
+		sd->v = 0.0f;
+#endif
+
+		if(flag & CURVE_KN_TRUETANGENTGNORMAL) {
+			sd->Ng = -(D - tg * dot(tg, D));
+			sd->Ng = normalize(sd->Ng);
+		}
+		else {
+			float gd = isect->v;
+
+			/* direction from inside to surface of curve. NOTE(review): reads sd->u, which is only assigned above when __UV__ is defined */
+			sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd);
+
+			/* adjustment for changing radius */
+			if(gd != 0.0f) {  /* Ng is only re-normalized inside this branch */
+				sd->Ng = sd->Ng - gd * tg;
+				sd->Ng = normalize(sd->Ng);
+			}
+		}
+
+		sd->N = sd->Ng;
+	}
+
+#ifdef __DPDU__
+	/* dPdu/dPdv */
+	sd->dPdu = tg;
+	sd->dPdv = cross(tg, sd->Ng);
+#endif
+
+	if(isect->object != OBJECT_NONE) {  /* map the hit position back with the object transform */
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_tfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
+#endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+}
+
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_motion_curve.h b/intern/cycles/kernel/geom/geom_motion_curve.h
index 6de5aa7ea99..119bdb2f15c 100644
--- a/intern/cycles/kernel/geom/geom_motion_curve.h
+++ b/intern/cycles/kernel/geom/geom_motion_curve.h
@@ -50,12 +50,12 @@ ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg, int object,
 ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg, int offset, int numkeys, int numsteps, int step, int k0, int k1, float4 keys[2])
 {
 	if(step == numsteps) {
-		/* center step: regular vertex location */
+		/* center step: regular key location */
 		keys[0] = kernel_tex_fetch(__curve_keys, k0);
 		keys[1] = kernel_tex_fetch(__curve_keys, k1);
 	}
 	else {
-		/* center step not stored in this array */
+		/* center step is not stored in this array */
 		if(step > numsteps)
 			step--;
 
@@ -97,14 +97,14 @@ ccl_device_inline void motion_curve_keys(KernelGlobals *kg, int object, int prim
 ccl_device_inline void motion_cardinal_curve_keys_for_step(KernelGlobals *kg, int offset, int numkeys, int numsteps, int step, int k0, int k1, int k2, int k3, float4 keys[4])
 {
 	if(step == numsteps) {
-		/* center step: regular vertex location */
+		/* center step: regular key location */
 		keys[0] = kernel_tex_fetch(__curve_keys, k0);
 		keys[1] = kernel_tex_fetch(__curve_keys, k1);
 		keys[2] = kernel_tex_fetch(__curve_keys, k2);
 		keys[3] = kernel_tex_fetch(__curve_keys, k3);
 	}
 	else {
-		/* center step not store in this array */
+		/* center step is not stored in this array */
 		if(step > numsteps)
 			step--;
 
@@ -118,7 +118,12 @@ ccl_device_inline void motion_cardinal_curve_keys_for_step(KernelGlobals *kg, in
 }
 
 /* return 2 curve key locations */
-ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg, int object, int prim, float time, int k0, int k1, int k2, int k3, float4 keys[4])
+ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg,
+                                                  int object,
+                                                  int prim,
+                                                  float time,
+                                                  int k0, int k1, int k2, int k3,
+                                                  float4 keys[4])
 {
 	/* get motion info */
 	int numsteps, numkeys;
@@ -147,6 +152,65 @@ ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg, int object,
 	keys[3] = (1.0f - t)*keys[3] + t*next_keys[3];
 }
 
+#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
+/* AVX flavour of motion_cardinal_curve_keys(): the four interpolated keys
+ * are returned as two AVX registers, each built from a pair of float4 keys.
+ *
+ * out_keys_0_1 receives keys k0/k1 and out_keys_2_3 receives keys k2/k3.
+ */
+ccl_device_inline void motion_cardinal_curve_keys_avx(KernelGlobals *kg,
+                                                      int object,
+                                                      int prim,
+                                                      float time,
+                                                      int k0, int k1,
+                                                      int k2, int k3,
+                                                      avxf *out_keys_0_1,
+                                                      avxf *out_keys_2_3)
+{
+	/* Motion step and key counts of the object. */
+	int step_count, key_count;
+	object_motion_info(kg, object, &step_count, NULL, &key_count);
+
+	/* Select the step interval that contains the requested time, together
+	 * with the blend factor inside that interval.
+	 */
+	const int last_step = step_count * 2;
+	const int step_lo = min((int)(time*last_step), last_step - 1);
+	const float blend = time*last_step - step_lo;
+
+	/* Find the offset of the motion position attribute. */
+	AttributeElement attr_elem;
+	const int attr_offset = find_attribute_curve_motion(
+	        kg, object, ATTR_STD_MOTION_VERTEX_POSITION, &attr_elem);
+	kernel_assert(attr_offset != ATTR_STD_NOT_FOUND);
+
+	/* Fetch the four keys at the lower and the upper step. */
+	float4 keys_hi[4];
+	float4 keys_lo[4];
+	motion_cardinal_curve_keys_for_step(
+	        kg, attr_offset, key_count, step_count,
+	        step_lo,
+	        k0, k1, k2, k3,
+	        keys_lo);
+	motion_cardinal_curve_keys_for_step(
+	        kg, attr_offset, key_count, step_count,
+	        step_lo + 1,
+	        k0, k1, k2, k3,
+	        keys_hi);
+
+	/* Pack each pair of float4 keys into one AVX register. */
+	const avxf lo_0_1 = avxf(keys_lo[0].m128, keys_lo[1].m128);
+	const avxf lo_2_3 = avxf(keys_lo[2].m128, keys_lo[3].m128);
+	const avxf hi_0_1 = avxf(keys_hi[0].m128, keys_hi[1].m128);
+	const avxf hi_2_3 = avxf(keys_hi[2].m128, keys_hi[3].m128);
+
+	/* Linear interpolation between the two steps. */
+	const float keep = 1.0f - blend;
+	*out_keys_0_1 = keep * lo_0_1 + blend*hi_0_1;
+	*out_keys_2_3 = keep * lo_2_3 + blend*hi_2_3;
+}
+#endif
+
 #endif
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h
index 3cbe59aaece..4e84aa97776 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -76,7 +76,7 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg, uint4
 		normals[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
 	}
 	else {
-		/* center step not stored in this array */
+		/* center step is not stored in this array */
 		if(step > numsteps)
 			step--;
 
@@ -117,312 +117,4 @@ ccl_device_inline void motion_triangle_vertices(KernelGlobals *kg, int object, i
 	verts[2] = (1.0f - t)*verts[2] + t*next_verts[2];
 }
 
-/* Refine triangle intersection to more precise hit point. For rays that travel
- * far the precision is often not so good, this reintersects the primitive from
- * a closer distance. */
-
-ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, float3 verts[3])
-{
-	float3 P = ray->P;
-	float3 D = ray->D;
-	float t = isect->t;
-
-#ifdef __INTERSECTION_REFINE__
-	if(isect->object != OBJECT_NONE) {
-		if(UNLIKELY(t == 0.0f)) {
-			return P;
-		}
-#  ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
-#  else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#  endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D*t);
-		D = normalize_len(D, &t);
-	}
-
-	P = P + D*t;
-
-	/* compute refined intersection distance */
-	const float3 e1 = verts[0] - verts[2];
-	const float3 e2 = verts[1] - verts[2];
-	const float3 s1 = cross(D, e2);
-
-	const float invdivisor = 1.0f/dot(s1, e1);
-	const float3 d = P - verts[2];
-	const float3 s2 = cross(d, e1);
-	float rt = dot(e2, s2)*invdivisor;
-
-	/* compute refined position */
-	P = P + D*rt;
-
-	if(isect->object != OBJECT_NONE) {
-#  ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
-#  else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#  endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-#else
-	return P + D*t;
-#endif
-}
-
-/* Same as above, except that isect->t is assumed to be in object space for instancing */
-
-#ifdef __SUBSURFACE__
-#  if defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86))
-ccl_device_noinline
-#  else
-ccl_device_inline
-#  endif
-float3 motion_triangle_refine_subsurface(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, float3 verts[3])
-{
-	float3 P = ray->P;
-	float3 D = ray->D;
-	float t = isect->t;
-
-#  ifdef __INTERSECTION_REFINE__
-	if(isect->object != OBJECT_NONE) {
-#    ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
-#    else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#    endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D);
-		D = normalize(D);
-	}
-
-	P = P + D*t;
-
-	/* compute refined intersection distance */
-	const float3 e1 = verts[0] - verts[2];
-	const float3 e2 = verts[1] - verts[2];
-	const float3 s1 = cross(D, e2);
-
-	const float invdivisor = 1.0f/dot(s1, e1);
-	const float3 d = P - verts[2];
-	const float3 s2 = cross(d, e1);
-	float rt = dot(e2, s2)*invdivisor;
-
-	P = P + D*rt;
-
-	if(isect->object != OBJECT_NONE) {
-#    ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
-#    else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#    endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-#  else
-	return P + D*t;
-#  endif
-}
-#endif
-
-/* Setup of motion triangle specific parts of ShaderData, moved into this one
- * function to more easily share computation of interpolated positions and
- * normals */
-
-/* return 3 triangle vertex normals */
-ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool subsurface)
-{
-	/* get shader */
-	ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
-
-	/* get motion info */
-	int numsteps, numverts;
-	object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL);
-
-	/* figure out which steps we need to fetch and their interpolation factor */
-	int maxstep = numsteps*2;
-	int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1);
-	float t = ccl_fetch(sd, time)*maxstep - step;
-
-	/* find attribute */
-	AttributeElement elem;
-	int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_POSITION, &elem);
-	kernel_assert(offset != ATTR_STD_NOT_FOUND);
-
-	/* fetch vertex coordinates */
-	float3 verts[3], next_verts[3];
-	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
-
-	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
-	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts);
-
-	/* interpolate between steps */
-	verts[0] = (1.0f - t)*verts[0] + t*next_verts[0];
-	verts[1] = (1.0f - t)*verts[1] + t*next_verts[1];
-	verts[2] = (1.0f - t)*verts[2] + t*next_verts[2];
-
-	/* compute refined position */
-#ifdef __SUBSURFACE__
-	if(!subsurface)
-#endif
-		ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts);
-#ifdef __SUBSURFACE__
-	else
-		ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts);
-#endif
-
-	/* compute face normal */
-	float3 Ng;
-	if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED)
-		Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0]));
-	else
-		Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
-
-	ccl_fetch(sd, Ng) = Ng;
-	ccl_fetch(sd, N) = Ng;
-
-	/* compute derivatives of P w.r.t. uv */
-#ifdef __DPDU__
-	ccl_fetch(sd, dPdu) = (verts[0] - verts[2]);
-	ccl_fetch(sd, dPdv) = (verts[1] - verts[2]);
-#endif
-
-	/* compute smooth normal */
-	if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
-		/* find attribute */
-		AttributeElement elem;
-		int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_NORMAL, &elem);
-		kernel_assert(offset != ATTR_STD_NOT_FOUND);
-
-		/* fetch vertex coordinates */
-		float3 normals[3], next_normals[3];
-		motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step, normals);
-		motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_normals);
-
-		/* interpolate between steps */
-		normals[0] = (1.0f - t)*normals[0] + t*next_normals[0];
-		normals[1] = (1.0f - t)*normals[1] + t*next_normals[1];
-		normals[2] = (1.0f - t)*normals[2] + t*next_normals[2];
-
-		/* interpolate between vertices */
-		float u = ccl_fetch(sd, u);
-		float v = ccl_fetch(sd, v);
-		float w = 1.0f - u - v;
-		ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]);
-	}
-}
-
-/* Ray intersection. We simply compute the vertex positions at the given ray
- * time and do a ray intersection with the resulting triangle */
-
-ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 dir, float time, uint visibility, int object, int triAddr)
-{
-	/* primitive index for vertex location lookup */
-	int prim = kernel_tex_fetch(__prim_index, triAddr);
-	int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, triAddr): object;
-
-	/* get vertex locations for intersection */
-	float3 verts[3];
-	motion_triangle_vertices(kg, fobject, prim, time, verts);
-
-	/* ray-triangle intersection, unoptimized */
-	float t, u, v;
-
-	if(ray_triangle_intersect_uv(P, dir, isect->t, verts[2], verts[0], verts[1], &u, &v, &t)) {
-#ifdef __VISIBILITY_FLAG__
-		/* visibility flag test. we do it here under the assumption
-		 * that most triangles are culled by node flags */
-		if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
-#endif
-		{
-			isect->t = t;
-			isect->u = u;
-			isect->v = v;
-			isect->prim = triAddr;
-			isect->object = object;
-			isect->type = PRIMITIVE_MOTION_TRIANGLE;
-		
-			return true;
-		}
-	}
-
-	return false;
-}
-
-/* Special ray intersection routines for subsurface scattering. In that case we
- * only want to intersect with primitives in the same object, and if case of
- * multiple hits we pick a single random primitive as the intersection point. */
-
-#ifdef __SUBSURFACE__
-ccl_device_inline void motion_triangle_intersect_subsurface(
-        KernelGlobals *kg,
-        SubsurfaceIntersection *ss_isect,
-        float3 P,
-        float3 dir,
-        float time,
-        int object,
-        int triAddr,
-        float tmax,
-        uint *lcg_state,
-        int max_hits)
-{
-	/* primitive index for vertex location lookup */
-	int prim = kernel_tex_fetch(__prim_index, triAddr);
-	int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, triAddr): object;
-
-	/* get vertex locations for intersection */
-	float3 verts[3];
-	motion_triangle_vertices(kg, fobject, prim, time, verts);
-
-	/* ray-triangle intersection, unoptimized */
-	float t, u, v;
-
-	if(ray_triangle_intersect_uv(P, dir, tmax, verts[2], verts[0], verts[1], &u, &v, &t)) {
-		for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) {
-			if(ss_isect->hits[i].t == t) {
-				return;
-			}
-		}
-
-		ss_isect->num_hits++;
-
-		int hit;
-
-		if(ss_isect->num_hits <= max_hits) {
-			hit = ss_isect->num_hits - 1;
-		}
-		else {
-			/* reservoir sampling: if we are at the maximum number of
-			 * hits, randomly replace element or skip it */
-			hit = lcg_step_uint(lcg_state) % ss_isect->num_hits;
-
-			if(hit >= max_hits)
-				return;
-		}
-
-		/* record intersection */
-		Intersection *isect = &ss_isect->hits[hit];
-		isect->t = t;
-		isect->u = u;
-		isect->v = v;
-		isect->prim = triAddr;
-		isect->object = object;
-		isect->type = PRIMITIVE_MOTION_TRIANGLE;
-
-		/* Record geometric normal. */
-		ss_isect->Ng[hit] = normalize(cross(verts[1] - verts[0],
-		                                    verts[2] - verts[0]));
-	}
-}
-#endif
-
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
new file mode 100644
index 00000000000..f74995becf5
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
@@ -0,0 +1,289 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Motion Triangle Primitive
+ *
+ * These are stored as regular triangles, plus extra positions and normals at
+ * times other than the frame center. Computing the triangle vertex positions
+ * or normals at a given ray time is a matter of interpolation of the two steps
+ * between which the ray time lies.
+ *
+ * The extra positions and normals are stored as ATTR_STD_MOTION_VERTEX_POSITION
+ * and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Refine triangle intersection to a more precise hit point. For rays that travel
+ * far, isect->t loses precision, so this re-intersects the triangle (verts) from
+ * the approximate hit P + D*t, in object space if isect->object != OBJECT_NONE.
+ */
+
+ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg,
+                                                ShaderData *sd,
+                                                const Intersection *isect,
+                                                const Ray *ray,
+                                                float3 verts[3])
+{
+	float3 P = ray->P;
+	float3 D = ray->D;
+	float t = isect->t;
+
+#ifdef __INTERSECTION_REFINE__
+	if(isect->object != OBJECT_NONE) {
+		if(UNLIKELY(t == 0.0f)) {
+			return P;
+		}
+#  ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_itfm;
+#  else
+		Transform tfm = object_fetch_transform(kg,
+		                                       isect->object,
+		                                       OBJECT_INVERSE_TRANSFORM);
+#  endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D*t);
+		D = normalize_len(D, &t);
+	}
+
+	P = P + D*t;
+
+	/* Compute refined intersection distance rt from P along D (Moller-Trumbore). */
+	const float3 e1 = verts[0] - verts[2];
+	const float3 e2 = verts[1] - verts[2];
+	const float3 s1 = cross(D, e2);
+
+	const float invdivisor = 1.0f/dot(s1, e1);
+	const float3 d = P - verts[2];
+	const float3 s2 = cross(d, e1);
+	float rt = dot(e2, s2)*invdivisor;
+
+	/* Compute refined position. */
+	P = P + D*rt;
+
+	if(isect->object != OBJECT_NONE) {
+#  ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_tfm;
+#  else
+		Transform tfm = object_fetch_transform(kg,
+		                                       isect->object,
+		                                       OBJECT_TRANSFORM);
+#  endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+#else
+	return P + D*t;
+#endif
+}
+
+/* Subsurface variant of motion_triangle_refine(): isect->t is assumed to be in
+ * object space for instancing, so D is only normalized and t is not rescaled.
+ */
+
+#ifdef __SUBSURFACE__
+#  if defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86))
+ccl_device_noinline
+#  else
+ccl_device_inline
+#  endif
+float3 motion_triangle_refine_subsurface(KernelGlobals *kg,
+                                         ShaderData *sd,
+                                         const Intersection *isect,
+                                         const Ray *ray,
+                                         float3 verts[3])
+{
+	float3 P = ray->P;
+	float3 D = ray->D;
+	float t = isect->t;
+
+#  ifdef __INTERSECTION_REFINE__
+	if(isect->object != OBJECT_NONE) {
+#    ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_itfm;
+#    else
+		Transform tfm = object_fetch_transform(kg,
+		                                       isect->object,
+		                                       OBJECT_INVERSE_TRANSFORM);
+#    endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D);
+		D = normalize(D);
+	}
+
+	P = P + D*t;
+
+	/* Compute refined intersection distance. */
+	const float3 e1 = verts[0] - verts[2];
+	const float3 e2 = verts[1] - verts[2];
+	const float3 s1 = cross(D, e2);
+
+	const float invdivisor = 1.0f/dot(s1, e1);
+	const float3 d = P - verts[2];
+	const float3 s2 = cross(d, e1);
+	float rt = dot(e2, s2)*invdivisor;
+
+	P = P + D*rt;
+
+	if(isect->object != OBJECT_NONE) {
+#    ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_tfm;
+#    else
+		Transform tfm = object_fetch_transform(kg,
+		                                       isect->object,
+		                                       OBJECT_TRANSFORM);
+#    endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+#  else  /* __INTERSECTION_REFINE__ */
+	return P + D*t;
+#  endif  /* __INTERSECTION_REFINE__ */
+}
+#endif  /* __SUBSURFACE__ */
+
+
+/* Ray intersection. Compute the vertex positions at the given ray time and
+ * intersect the resulting triangle; on a visible hit, fill isect and return true.
+ */
+
+ccl_device_inline bool motion_triangle_intersect(
+        KernelGlobals *kg,
+        Intersection *isect,
+        float3 P,
+        float3 dir,
+        float time,
+        uint visibility,
+        int object,
+        int prim_addr)
+{
+	/* Primitive index for vertex location lookup. */
+	int prim = kernel_tex_fetch(__prim_index, prim_addr);
+	int fobject = (object == OBJECT_NONE)
+	                  ? kernel_tex_fetch(__prim_object, prim_addr)
+	                  : object;
+	/* Get vertex locations for intersection. */
+	float3 verts[3];
+	motion_triangle_vertices(kg, fobject, prim, time, verts);
+	/* Ray-triangle intersection, unoptimized. */
+	float t, u, v;
+	if(ray_triangle_intersect(P,
+	                          dir,
+	                          isect->t,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	                          (ssef*)verts,
+#else
+	                          verts[0], verts[1], verts[2],
+#endif
+	                          &u, &v, &t))
+	{
+#ifdef __VISIBILITY_FLAG__
+		/* Visibility flag test. We do it here under the assumption
+		 * that most triangles are culled by node flags.
+		 */
+		if(kernel_tex_fetch(__prim_visibility, prim_addr) & visibility)
+#endif
+		{
+			isect->t = t;
+			isect->u = u;
+			isect->v = v;
+			isect->prim = prim_addr;
+			isect->object = object;
+			isect->type = PRIMITIVE_MOTION_TRIANGLE;
+			return true;
+		}
+	}
+	return false;
+}
+
+/* Special ray intersection routines for subsurface scattering. In that case we
+ * only want to intersect with primitives in the same object, and in case of
+ * more than max_hits hits we keep a random subset of them (reservoir sampling).
+ */
+#ifdef __SUBSURFACE__
+ccl_device_inline void motion_triangle_intersect_subsurface(
+        KernelGlobals *kg,
+        SubsurfaceIntersection *ss_isect,
+        float3 P,
+        float3 dir,
+        float time,
+        int object,
+        int prim_addr,
+        float tmax,
+        uint *lcg_state,
+        int max_hits)
+{
+	/* Primitive index for vertex location lookup. */
+	int prim = kernel_tex_fetch(__prim_index, prim_addr);
+	int fobject = (object == OBJECT_NONE)
+	                  ? kernel_tex_fetch(__prim_object, prim_addr)
+	                  : object;
+	/* Get vertex locations for intersection. */
+	float3 verts[3];
+	motion_triangle_vertices(kg, fobject, prim, time, verts);
+	/* Ray-triangle intersection, unoptimized. */
+	float t, u, v;
+	if(ray_triangle_intersect(P,
+	                          dir,
+	                          tmax,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	                          (ssef*)verts,
+#else
+	                          verts[0], verts[1], verts[2],
+#endif
+	                          &u, &v, &t))
+	{
+		for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) {
+			if(ss_isect->hits[i].t == t) {
+				return;
+			}
+		}
+		ss_isect->num_hits++;
+		int hit;
+		if(ss_isect->num_hits <= max_hits) {
+			hit = ss_isect->num_hits - 1;
+		}
+		else {
+			/* Reservoir sampling: if we are at the maximum number of
+			 * hits, randomly replace element or skip it.
+			 */
+			hit = lcg_step_uint(lcg_state) % ss_isect->num_hits;
+
+			if(hit >= max_hits)
+				return;
+		}
+		/* Record intersection. */
+		Intersection *isect = &ss_isect->hits[hit];
+		isect->t = t;
+		isect->u = u;
+		isect->v = v;
+		isect->prim = prim_addr;
+		isect->object = object;
+		isect->type = PRIMITIVE_MOTION_TRIANGLE;
+		/* Record geometric normal. */
+		ss_isect->Ng[hit] = normalize(cross(verts[1] - verts[0],
+		                                    verts[2] - verts[0]));
+	}
+}
+#endif  /* __SUBSURFACE__ */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
new file mode 100644
index 00000000000..cb456056e20
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Motion Triangle Primitive
+ *
+ * These are stored as regular triangles, plus extra positions and normals at
+ * times other than the frame center. Computing the triangle vertex positions
+ * or normals at a given ray time is a matter of interpolation of the two steps
+ * between which the ray time lies.
+ *
+ * The extra positions and normals are stored as ATTR_STD_MOTION_VERTEX_POSITION
+ * and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Setup of motion triangle specific parts of ShaderData, moved into this one
+ * function to more easily share computation of interpolated positions and
+ * normals */
+
+/* Sets sd->shader, sd->P, sd->Ng, sd->N and, under __DPDU__, sd->dPdu/dPdv. */
+ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg,
+                                                      ShaderData *sd, const
+                                                      Intersection *isect,
+                                                      const Ray *ray,
+                                                      bool subsurface)
+{
+	/* Get shader. */
+	sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
+	/* Get motion info. */
+	/* TODO(sergey): This logic is really similar to motion_triangle_vertices(),
+	 * can we de-duplicate something here?
+	 */
+	int numsteps, numverts;
+	object_motion_info(kg, sd->object, &numsteps, &numverts, NULL);
+	/* Figure out which steps we need to fetch and their interpolation factor. */
+	int maxstep = numsteps*2;
+	int step = min((int)(sd->time*maxstep), maxstep-1);
+	float t = sd->time*maxstep - step;
+	/* Find attribute. */
+	AttributeElement elem;
+	int offset = find_attribute_motion(kg, sd->object,
+	                                   ATTR_STD_MOTION_VERTEX_POSITION,
+	                                   &elem);
+	kernel_assert(offset != ATTR_STD_NOT_FOUND);
+	/* Fetch vertex coordinates. */
+	float3 verts[3], next_verts[3];
+	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
+	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts);
+	/* Interpolate between steps. */
+	verts[0] = (1.0f - t)*verts[0] + t*next_verts[0];
+	verts[1] = (1.0f - t)*verts[1] + t*next_verts[1];
+	verts[2] = (1.0f - t)*verts[2] + t*next_verts[2];
+	/* Compute refined position. */
+#ifdef __SUBSURFACE__
+	if(subsurface) {
+		sd->P = motion_triangle_refine_subsurface(kg,
+		                                                     sd,
+		                                                     isect,
+		                                                     ray,
+		                                                     verts);
+	}
+	else
+#endif  /* __SUBSURFACE__ */
+	{
+		sd->P = motion_triangle_refine(kg, sd, isect, ray, verts);
+	}
+	/* Compute face normal. */
+	float3 Ng;
+	if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
+		Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0]));
+	}
+	else {
+		Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
+	}
+	sd->Ng = Ng;
+	sd->N = Ng;
+	/* Compute derivatives of P w.r.t. uv. */
+#ifdef __DPDU__
+	sd->dPdu = (verts[0] - verts[2]);
+	sd->dPdv = (verts[1] - verts[2]);
+#endif
+	/* Compute smooth normal. */
+	if(sd->shader & SHADER_SMOOTH_NORMAL) {
+		/* Find attribute. */
+		AttributeElement elem;
+		int offset = find_attribute_motion(kg,
+		                                   sd->object,
+		                                   ATTR_STD_MOTION_VERTEX_NORMAL,
+		                                   &elem);
+		kernel_assert(offset != ATTR_STD_NOT_FOUND);
+		/* Fetch vertex normals. */
+		float3 normals[3], next_normals[3];
+		motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step, normals);
+		motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_normals);
+		/* Interpolate between steps. */
+		normals[0] = (1.0f - t)*normals[0] + t*next_normals[0];
+		normals[1] = (1.0f - t)*normals[1] + t*next_normals[1];
+		normals[2] = (1.0f - t)*normals[2] + t*next_normals[2];
+		/* Interpolate between vertices. NOTE(review): result is not normalized here. */
+		float u = sd->u;
+		float v = sd->v;
+		float w = 1.0f - u - v;
+		sd->N = (u*normals[0] + v*normals[1] + w*normals[2]);
+	}
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index 9f0fe032ba4..1ffc143be34 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -113,7 +113,6 @@ ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg, int
 ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg, int object, float time, Transform *itfm)
 {
 	int object_flag = kernel_tex_fetch(__object_flag, object);
-
 	if(object_flag & SD_OBJECT_MOTION) {
 		/* if we do motion blur */
 		Transform tfm = object_fetch_transform_motion(kg, object, time);
@@ -138,9 +137,9 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg
 ccl_device_inline void object_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
-	*P = transform_point_auto(&ccl_fetch(sd, ob_tfm), *P);
+	*P = transform_point_auto(&sd->ob_tfm, *P);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
 	*P = transform_point(&tfm, *P);
 #endif
 }
@@ -150,9 +149,9 @@ ccl_device_inline void object_position_transform(KernelGlobals *kg, const Shader
 ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
-	*P = transform_point_auto(&ccl_fetch(sd, ob_itfm), *P);
+	*P = transform_point_auto(&sd->ob_itfm, *P);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
 	*P = transform_point(&tfm, *P);
 #endif
 }
@@ -162,12 +161,12 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, cons
 ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
-	if((ccl_fetch(sd, object) != OBJECT_NONE) || (ccl_fetch(sd, type) == PRIMITIVE_LAMP)) {
-		*N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_tfm), *N));
+	if((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) {
+		*N = normalize(transform_direction_transposed_auto(&sd->ob_tfm, *N));
 	}
 #else
-	if(ccl_fetch(sd, object) != OBJECT_NONE) {
-		Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+	if(sd->object != OBJECT_NONE) {
+		Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
 		*N = normalize(transform_direction_transposed(&tfm, *N));
 	}
 #endif
@@ -178,9 +177,9 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const
 ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
-	*N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_itfm), *N));
+	*N = normalize(transform_direction_transposed_auto(&sd->ob_itfm, *N));
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
 	*N = normalize(transform_direction_transposed(&tfm, *N));
 #endif
 }
@@ -190,9 +189,9 @@ ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderDa
 ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
-	*D = transform_direction_auto(&ccl_fetch(sd, ob_tfm), *D);
+	*D = transform_direction_auto(&sd->ob_tfm, *D);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
 	*D = transform_direction(&tfm, *D);
 #endif
 }
@@ -202,9 +201,9 @@ ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData
 ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
-	*D = transform_direction_auto(&ccl_fetch(sd, ob_itfm), *D);
+	*D = transform_direction_auto(&sd->ob_itfm, *D);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
 	*D = transform_direction(&tfm, *D);
 #endif
 }
@@ -213,13 +212,13 @@ ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const Sha
 
 ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd)
 {
-	if(ccl_fetch(sd, object) == OBJECT_NONE)
+	if(sd->object == OBJECT_NONE)
 		return make_float3(0.0f, 0.0f, 0.0f);
 
 #ifdef __OBJECT_MOTION__
-	return make_float3(ccl_fetch(sd, ob_tfm).x.w, ccl_fetch(sd, ob_tfm).y.w, ccl_fetch(sd, ob_tfm).z.w);
+	return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
 	return make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
 #endif
 }
@@ -327,7 +326,7 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object)
 
 ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
 {
-	return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE + 1);
+	return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE + 1);
 }
 
 /* Particle data from which object was instanced */
@@ -416,17 +415,18 @@ ccl_device_inline float3 bvh_clamp_direction(float3 dir)
 
 ccl_device_inline float3 bvh_inverse_direction(float3 dir)
 {
-	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
 	return rcp(dir);
-#else
-	return 1.0f / dir;
-#endif
 }
 
 /* Transform ray into object space to enter static object in BVH */
 
-ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
+ccl_device_inline float bvh_instance_push(KernelGlobals *kg,
+                                          int object,
+                                          const Ray *ray,
+                                          float3 *P,
+                                          float3 *dir,
+                                          float3 *idir,
+                                          float t)
 {
 	Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
 
@@ -436,8 +436,11 @@ ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ra
 	*dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len));
 	*idir = bvh_inverse_direction(*dir);
 
-	if(*t != FLT_MAX)
-		*t *= len;
+	if(t != FLT_MAX) {
+		t *= len;
+	}
+
+	return t;
 }
 
 #ifdef __QBVH__
@@ -474,16 +477,24 @@ ccl_device_inline void qbvh_instance_push(KernelGlobals *kg,
 
 /* Transorm ray to exit static object in BVH */
 
-ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
+ccl_device_inline float bvh_instance_pop(KernelGlobals *kg,
+                                         int object,
+                                         const Ray *ray,
+                                         float3 *P,
+                                         float3 *dir,
+                                         float3 *idir,
+                                         float t)
 {
-	if(*t != FLT_MAX) {
+	if(t != FLT_MAX) {
 		Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-		*t /= len(transform_direction(&tfm, ray->D));
+		t /= len(transform_direction(&tfm, ray->D));
 	}
 
 	*P = ray->P;
 	*dir = bvh_clamp_direction(ray->D);
 	*idir = bvh_inverse_direction(*dir);
+
+	return t;
 }
 
 /* Same as above, but returns scale factor to apply to multiple intersection distances */
@@ -502,13 +513,13 @@ ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg, int object, co
 #ifdef __OBJECT_MOTION__
 /* Transform ray into object space to enter motion blurred object in BVH */
 
-ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg,
+ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg,
                                                 int object,
                                                 const Ray *ray,
                                                 float3 *P,
                                                 float3 *dir,
                                                 float3 *idir,
-                                                ccl_addr_space float *t,
+                                                float t,
                                                 Transform *itfm)
 {
 	object_fetch_transform_motion_test(kg, object, ray->time, itfm);
@@ -519,8 +530,11 @@ ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg,
 	*dir = bvh_clamp_direction(normalize_len(transform_direction(itfm, ray->D), &len));
 	*idir = bvh_inverse_direction(*dir);
 
-	if(*t != FLT_MAX)
-		*t *= len;
+	if(t != FLT_MAX) {
+		t *= len;
+	}
+
+	return t;
 }
 
 #ifdef __QBVH__
@@ -558,22 +572,24 @@ ccl_device_inline void qbvh_instance_motion_push(KernelGlobals *kg,
 
 /* Transorm ray to exit motion blurred object in BVH */
 
-ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg,
-                                               int object,
-                                               const Ray *ray,
-                                               float3 *P,
-                                               float3 *dir,
-                                               float3 *idir,
-                                               ccl_addr_space float *t,
-                                               Transform *itfm)
-{
-	if(*t != FLT_MAX) {
-		*t /= len(transform_direction(itfm, ray->D));
+ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg,
+                                                int object,
+                                                const Ray *ray,
+                                                float3 *P,
+                                                float3 *dir,
+                                                float3 *idir,
+                                                float t,
+                                                Transform *itfm)
+{
+	if(t != FLT_MAX) {
+		t /= len(transform_direction(itfm, ray->D));
 	}
 
 	*P = ray->P;
 	*dir = bvh_clamp_direction(ray->D);
 	*idir = bvh_inverse_direction(*dir);
+
+	return t;
 }
 
 /* Same as above, but returns scale factor to apply to multiple intersection distances */
diff --git a/intern/cycles/kernel/geom/geom_patch.h b/intern/cycles/kernel/geom/geom_patch.h
index 6a0ff5a4a04..5663b598508 100644
--- a/intern/cycles/kernel/geom/geom_patch.h
+++ b/intern/cycles/kernel/geom/geom_patch.h
@@ -267,7 +267,7 @@ ccl_device float patch_eval_float(KernelGlobals *kg, const ShaderData *sd, int o
 	float weights_du[PATCH_MAX_CONTROL_VERTS];
 	float weights_dv[PATCH_MAX_CONTROL_VERTS];
 
-	int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel,
+	int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel,
 	                                           indices, weights, weights_du, weights_dv);
 
 	float val = 0.0f;
@@ -294,7 +294,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals *kg, const ShaderData *sd, int
 	float weights_du[PATCH_MAX_CONTROL_VERTS];
 	float weights_dv[PATCH_MAX_CONTROL_VERTS];
 
-	int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel,
+	int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel,
 	                                           indices, weights, weights_du, weights_dv);
 
 	float3 val = make_float3(0.0f, 0.0f, 0.0f);
@@ -321,7 +321,7 @@ ccl_device float3 patch_eval_uchar4(KernelGlobals *kg, const ShaderData *sd, int
 	float weights_du[PATCH_MAX_CONTROL_VERTS];
 	float weights_dv[PATCH_MAX_CONTROL_VERTS];
 
-	int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel,
+	int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel,
 	                                           indices, weights, weights_du, weights_dv);
 
 	float3 val = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index dbf0b804b5d..989f1574e94 100644
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -28,19 +28,19 @@ ccl_device_inline float primitive_attribute_float(KernelGlobals *kg,
                                                   const AttributeDescriptor desc,
                                                   float *dx, float *dy)
 {
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
+	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
 		if(subd_triangle_patch(kg, sd) == ~0)
 			return triangle_attribute_float(kg, sd, desc, dx, dy);
 		else
 			return subd_triangle_attribute_float(kg, sd, desc, dx, dy);
 	}
 #ifdef __HAIR__
-	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	else if(sd->type & PRIMITIVE_ALL_CURVE) {
 		return curve_attribute_float(kg, sd, desc, dx, dy);
 	}
 #endif
 #ifdef __VOLUME__
-	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_VOLUME) {
+	else if(sd->type & PRIMITIVE_ALL_VOLUME) {
 		return volume_attribute_float(kg, sd, desc, dx, dy);
 	}
 #endif
@@ -56,19 +56,19 @@ ccl_device_inline float3 primitive_attribute_float3(KernelGlobals *kg,
                                                     const AttributeDescriptor desc,
                                                     float3 *dx, float3 *dy)
 {
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
+	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
 		if(subd_triangle_patch(kg, sd) == ~0)
 			return triangle_attribute_float3(kg, sd, desc, dx, dy);
 		else
 			return subd_triangle_attribute_float3(kg, sd, desc, dx, dy);
 	}
 #ifdef __HAIR__
-	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	else if(sd->type & PRIMITIVE_ALL_CURVE) {
 		return curve_attribute_float3(kg, sd, desc, dx, dy);
 	}
 #endif
 #ifdef __VOLUME__
-	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_VOLUME) {
+	else if(sd->type & PRIMITIVE_ALL_VOLUME) {
 		return volume_attribute_float3(kg, sd, desc, dx, dy);
 	}
 #endif
@@ -118,9 +118,9 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in
 ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __HAIR__
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)
+	if(sd->type & PRIMITIVE_ALL_CURVE)
 #  ifdef __DPDU__
-		return normalize(ccl_fetch(sd, dPdu));
+		return normalize(sd->dPdu);
 #  else
 		return make_float3(0.0f, 0.0f, 0.0f);
 #  endif
@@ -133,12 +133,12 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 		float3 data = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 		data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f);
 		object_normal_transform(kg, sd, &data);
-		return cross(ccl_fetch(sd, N), normalize(cross(data, ccl_fetch(sd, N))));
+		return cross(sd->N, normalize(cross(data, sd->N)));
 	}
 	else {
 		/* otherwise use surface derivatives */
 #ifdef __DPDU__
-		return normalize(ccl_fetch(sd, dPdu));
+		return normalize(sd->dPdu);
 #else
 		return make_float3(0.0f, 0.0f, 0.0f);
 #endif
@@ -153,16 +153,17 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *
 	float3 center;
 
 #ifdef __HAIR__
-	bool is_curve_primitive = ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE;
+	bool is_curve_primitive = sd->type & PRIMITIVE_ALL_CURVE;
 	if(is_curve_primitive) {
 		center = curve_motion_center_location(kg, sd);
 
-		if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED))
+		if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 			object_position_transform(kg, sd, &center);
+		}
 	}
 	else
 #endif
-		center = ccl_fetch(sd, P);
+		center = sd->P;
 
 	float3 motion_pre = center, motion_post = center;
 
@@ -172,16 +173,16 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *
 	if(desc.offset != ATTR_STD_NOT_FOUND) {
 		/* get motion info */
 		int numverts, numkeys;
-		object_motion_info(kg, ccl_fetch(sd, object), NULL, &numverts, &numkeys);
+		object_motion_info(kg, sd->object, NULL, &numverts, &numkeys);
 
 		/* lookup attributes */
 		motion_pre = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 
-		desc.offset += (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys;
+		desc.offset += (sd->type & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys;
 		motion_post = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 
 #ifdef __HAIR__
-		if(is_curve_primitive && (ccl_fetch(sd, flag) & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
+		if(is_curve_primitive && (sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
 			object_position_transform(kg, sd, &motion_pre);
 			object_position_transform(kg, sd, &motion_post);
 		}
@@ -192,10 +193,10 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *
 	 * transformation was set match the world/object space of motion_pre/post */
 	Transform tfm;
 	
-	tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_PRE);
+	tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_PRE);
 	motion_pre = transform_point(&tfm, motion_pre);
 
-	tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_POST);
+	tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_POST);
 	motion_post = transform_point(&tfm, motion_post);
 
 	float3 motion_center;
diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h
index 647840dc696..044e82f03d4 100644
--- a/intern/cycles/kernel/geom/geom_subd_triangle.h
+++ b/intern/cycles/kernel/geom/geom_subd_triangle.h
@@ -22,14 +22,14 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd)
 {
-	return (ccl_fetch(sd, prim) != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, ccl_fetch(sd, prim)) : ~0;
+	return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
 }
 
 /* UV coords of triangle within patch */
 
 ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg, const ShaderData *sd, float2 uv[3])
 {
-	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 
 	uv[0] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.x);
 	uv[1] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.y);
@@ -110,7 +110,7 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 		float2 dpdv = uv[1] - uv[2];
 
 		/* p is [s, t] */
-		float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2];
+		float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
 
 		float a, dads, dadt;
 		a = patch_eval_float(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt);
@@ -123,8 +123,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 			float dtdv = dpdv.y;
 
 			if(dx) {
-				float dudx = ccl_fetch(sd, du).dx;
-				float dvdx = ccl_fetch(sd, dv).dx;
+				float dudx = sd->du.dx;
+				float dvdx = sd->dv.dx;
 
 				float dsdx = dsdu*dudx + dsdv*dvdx;
 				float dtdx = dtdu*dudx + dtdv*dvdx;
@@ -132,8 +132,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 				*dx = dads*dsdx + dadt*dtdx;
 			}
 			if(dy) {
-				float dudy = ccl_fetch(sd, du).dy;
-				float dvdy = ccl_fetch(sd, dv).dy;
+				float dudy = sd->du.dy;
+				float dvdy = sd->dv.dy;
 
 				float dsdy = dsdu*dudy + dsdv*dvdy;
 				float dtdy = dtdu*dudy + dtdv*dvdy;
@@ -174,11 +174,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 		float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
-		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+		if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+		if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
 #endif
 
-		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+		return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
 	}
 	else if(desc.element == ATTR_ELEMENT_CORNER) {
 		float2 uv[3];
@@ -202,11 +202,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 		float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
-		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+		if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+		if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
 #endif
 
-		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+		return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
 	}
 	else {
 		if(dx) *dx = 0.0f;
@@ -229,7 +229,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 		float2 dpdv = uv[1] - uv[2];
 
 		/* p is [s, t] */
-		float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2];
+		float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
 
 		float3 a, dads, dadt;
 
@@ -248,8 +248,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 			float dtdv = dpdv.y;
 
 			if(dx) {
-				float dudx = ccl_fetch(sd, du).dx;
-				float dvdx = ccl_fetch(sd, dv).dx;
+				float dudx = sd->du.dx;
+				float dvdx = sd->dv.dx;
 
 				float dsdx = dsdu*dudx + dsdv*dvdx;
 				float dtdx = dtdu*dudx + dtdv*dvdx;
@@ -257,8 +257,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 				*dx = dads*dsdx + dadt*dtdx;
 			}
 			if(dy) {
-				float dudy = ccl_fetch(sd, du).dy;
-				float dvdy = ccl_fetch(sd, dv).dy;
+				float dudy = sd->du.dy;
+				float dvdy = sd->dv.dy;
 
 				float dsdy = dsdu*dudy + dsdv*dvdy;
 				float dtdy = dtdu*dudy + dtdv*dvdy;
@@ -299,11 +299,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 		float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
-		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+		if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+		if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
 #endif
 
-		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+		return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
 	}
 	else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) {
 		float2 uv[3];
@@ -337,11 +337,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 		float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
-		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+		if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+		if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
 #endif
 
-		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+		return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
 	}
 	else {
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index 17538872ead..105aee8da15 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -26,16 +26,18 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
 {
 	/* load triangle vertices */
-	const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+	const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 	const float3 v0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0));
 	const float3 v1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1));
 	const float3 v2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2));
 
 	/* return normal */
-	if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED)
+	if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
 		return normalize(cross(v2 - v0, v1 - v0));
-	else
+	}
+	else {
 		return normalize(cross(v1 - v0, v2 - v0));
+	}
 }
 
 /* point and normal on triangle  */
@@ -46,20 +48,18 @@ ccl_device_inline void triangle_point_normal(KernelGlobals *kg, int object, int
 	float3 v0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0));
 	float3 v1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1));
 	float3 v2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2));
-
 	/* compute point */
 	float t = 1.0f - u - v;
 	*P = (u*v0 + v*v1 + t*v2);
-
 	/* get object flags */
 	int object_flag = kernel_tex_fetch(__object_flag, object);
-
 	/* compute normal */
-	if(object_flag & SD_NEGATIVE_SCALE_APPLIED)
+	if(object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
 		*Ng = normalize(cross(v2 - v0, v1 - v0));
-	else
+	}
+	else {
 		*Ng = normalize(cross(v1 - v0, v2 - v0));
-
+	}
 	/* shader`*/
 	*shader = kernel_tex_fetch(__tri_shader, prim);
 }
@@ -76,7 +76,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
 
 /* Interpolate smooth vertex normal from vertices */
 
-ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v)
+ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
 {
 	/* load triangle vertices */
 	const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -84,7 +84,9 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo
 	float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
 	float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
 
-	return normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
+	float3 N = safe_normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
+
+	return is_zero(N)? Ng: N;
 }
 
 /* Ray differentials on triangle */
@@ -110,34 +112,34 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s
 		if(dx) *dx = 0.0f;
 		if(dy) *dy = 0.0f;
 
-		return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim));
+		return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim);
 	}
 	else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
-		uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+		uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 
 		float f0 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.x);
 		float f1 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.y);
 		float f2 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.z);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
-		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
 #endif
 
-		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
 	}
 	else if(desc.element == ATTR_ELEMENT_CORNER) {
-		int tri = desc.offset + ccl_fetch(sd, prim)*3;
+		int tri = desc.offset + sd->prim*3;
 		float f0 = kernel_tex_fetch(__attributes_float, tri + 0);
 		float f1 = kernel_tex_fetch(__attributes_float, tri + 1);
 		float f2 = kernel_tex_fetch(__attributes_float, tri + 2);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
-		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
 #endif
 
-		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
 	}
 	else {
 		if(dx) *dx = 0.0f;
@@ -153,24 +155,24 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim)));
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim));
 	}
 	else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
-		uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+		uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 
 		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x));
 		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y));
 		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z));
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
-		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
 #endif
 
-		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
 	}
 	else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) {
-		int tri = desc.offset + ccl_fetch(sd, prim)*3;
+		int tri = desc.offset + sd->prim*3;
 		float3 f0, f1, f2;
 
 		if(desc.element == ATTR_ELEMENT_CORNER) {
@@ -185,11 +187,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 		}
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
-		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
 #endif
 
-		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
 	}
 	else {
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index eb7340583c8..804e74d7e37 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -22,232 +22,50 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Workaround stupidness of CUDA/OpenCL which doesn't allow to access indexed
- * component of float3 value.
- */
-#ifndef __KERNEL_CPU__
-#  define IDX(vec, idx) \
-    ((idx == 0) ? ((vec).x) : ( (idx == 1) ? ((vec).y) : ((vec).z) ))
-#else
-#  define IDX(vec, idx) ((vec)[idx])
-#endif
-
-/* Ray-Triangle intersection for BVH traversal
- *
- * Sven Woop
- * Watertight Ray/Triangle Intersection
- *
- * http://jcgt.org/published/0002/01/05/paper.pdf
- */
-
-/* Precalculated data for the ray->tri intersection. */
-typedef struct IsectPrecalc {
-	/* Maximal dimension kz, and orthogonal dimensions. */
-	int kx, ky, kz;
-
-	/* Shear constants. */
-	float Sx, Sy, Sz;
-} IsectPrecalc;
-
-#if (defined(__KERNEL_OPENCL_APPLE__)) || \
-    (defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86)))
-ccl_device_noinline
-#else
-ccl_device_inline
-#endif
-void triangle_intersect_precalc(float3 dir,
-                                IsectPrecalc *isect_precalc)
-{
-	/* Calculate dimension where the ray direction is maximal. */
-#ifndef __KERNEL_SSE__
-	int kz = util_max_axis(make_float3(fabsf(dir.x),
-	                                   fabsf(dir.y),
-	                                   fabsf(dir.z)));
-	int kx = kz + 1; if(kx == 3) kx = 0;
-	int ky = kx + 1; if(ky == 3) ky = 0;
-#else
-	int kx, ky, kz;
-	/* Avoiding mispredicted branch on direction. */
-	kz = util_max_axis(fabs(dir));
-	static const char inc_xaxis[] = {1, 2, 0, 55};
-	static const char inc_yaxis[] = {2, 0, 1, 55};
-	kx = inc_xaxis[kz];
-	ky = inc_yaxis[kz];
-#endif
-
-	float dir_kz = IDX(dir, kz);
-
-	/* Swap kx and ky dimensions to preserve winding direction of triangles. */
-	if(dir_kz < 0.0f) {
-		int tmp = kx;
-		kx = ky;
-		ky = tmp;
-	}
-
-	/* Calculate the shear constants. */
-	float inv_dir_z = 1.0f / dir_kz;
-	isect_precalc->Sx = IDX(dir, kx) * inv_dir_z;
-	isect_precalc->Sy = IDX(dir, ky) * inv_dir_z;
-	isect_precalc->Sz = inv_dir_z;
-
-	/* Store the dimensions. */
-	isect_precalc->kx = kx;
-	isect_precalc->ky = ky;
-	isect_precalc->kz = kz;
-}
-
-/* TODO(sergey): Make it general utility function. */
-ccl_device_inline float xor_signmask(float x, int y)
-{
-	return __int_as_float(__float_as_int(x) ^ y);
-}
-
 ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
-                                          const IsectPrecalc *isect_precalc,
                                           Intersection *isect,
                                           float3 P,
+                                          float3 dir,
                                           uint visibility,
                                           int object,
-                                          int triAddr)
+                                          int prim_addr)
 {
-	const int kx = isect_precalc->kx;
-	const int ky = isect_precalc->ky;
-	const int kz = isect_precalc->kz;
-	const float Sx = isect_precalc->Sx;
-	const float Sy = isect_precalc->Sy;
-	const float Sz = isect_precalc->Sz;
-
-	/* Calculate vertices relative to ray origin. */
-	const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, triAddr);
-
-#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
-	const avxf avxf_P(P.m128, P.m128);
-
-	const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0);
-	const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1);
-
-	const avxf AB = tri_ab - avxf_P;
-	const avxf BC = tri_bc - avxf_P;
-
-	const __m256i permuteMask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx);
-
-	const avxf AB_k = shuffle(AB, permuteMask);
-	const avxf BC_k = shuffle(BC, permuteMask);
-
-	/* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */
-	const avxf ABBC_kz = shuffle<2>(AB_k, BC_k);
-
-	/* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */
-	const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k);
-
-	const avxf Sxy(Sy, Sx, Sy, Sx);
-
-	/* Ax, Ay, Bx, By, Bx, By, Cx, Cy */
-	const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy);
-
-	float ABBC_kz_array[8];
-	_mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz);
-
-	const float A_kz = ABBC_kz_array[0];
-	const float B_kz = ABBC_kz_array[2];
-	const float C_kz = ABBC_kz_array[6];
-
-	/* By, Bx, Cy, Cx, By, Bx, Ay, Ax */
-	const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy);
-
-	const avxf negMask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000);
-
-	/* W           U                             V
-	 * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX
-	 */
-	const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, negMask /* Dont care */);
-
-	const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ negMask;
-
-	/* Calculate scaled barycentric coordinates. */
-	float WUVW_array[4];
-	_mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW));
-
-	const float W = WUVW_array[0];
-	const float U = WUVW_array[1];
-	const float V = WUVW_array[2];
-
-	const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW);
-	const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW,
-	                                               _mm256_setzero_ps(), 0));
-
-	if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) {
-		return false;
-	}
+	const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr);
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	const ssef *ssef_verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex];
 #else
 	const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0),
 	             tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1),
 	             tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2);
-	const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z);
-	const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z);
-	const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z);
-
-	const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz);
-	const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz);
-	const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz);
-
-	/* Perform shear and scale of vertices. */
-	const float Ax = A_kx - Sx * A_kz;
-	const float Ay = A_ky - Sy * A_kz;
-	const float Bx = B_kx - Sx * B_kz;
-	const float By = B_ky - Sy * B_kz;
-	const float Cx = C_kx - Sx * C_kz;
-	const float Cy = C_ky - Sy * C_kz;
-
-	/* Calculate scaled barycentric coordinates. */
-	float U = Cx * By - Cy * Bx;
-	float V = Ax * Cy - Ay * Cx;
-	float W = Bx * Ay - By * Ax;
-	if((U < 0.0f || V < 0.0f || W < 0.0f) &&
-	   (U > 0.0f || V > 0.0f || W > 0.0f))
-	{
-		return false;
-	}
 #endif
-
-	/* Calculate determinant. */
-	float det = U + V + W;
-	if(UNLIKELY(det == 0.0f)) {
-		return false;
-	}
-
-	/* Calculate scaled z-coordinates of vertices and use them to calculate
-	 * the hit distance.
-	 */
-	const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz;
-	const int sign_det = (__float_as_int(det) & 0x80000000);
-	const float sign_T = xor_signmask(T, sign_det);
-	if((sign_T < 0.0f) ||
-	   (sign_T > isect->t * xor_signmask(det, sign_det)))
+	float t, u, v;
+	if(ray_triangle_intersect(P,
+	                          dir,
+	                          isect->t,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	                          ssef_verts,
+#else
+	                          float4_to_float3(tri_a),
+	                          float4_to_float3(tri_b),
+	                          float4_to_float3(tri_c),
+#endif
+	                          &u, &v, &t))
 	{
-		return false;
-	}
-
 #ifdef __VISIBILITY_FLAG__
-	/* visibility flag test. we do it here under the assumption
-	 * that most triangles are culled by node flags */
-	if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
+		/* Visibility flag test. We do it here under the assumption
+		 * that most triangles are culled by node flags.
+		 */
+		if(kernel_tex_fetch(__prim_visibility, prim_addr) & visibility)
 #endif
-	{
-#ifdef __KERNEL_CUDA__
-		if(A == B && B == C) {
-			return false;
+		{
+			isect->prim = prim_addr;
+			isect->object = object;
+			isect->type = PRIMITIVE_TRIANGLE;
+			isect->u = u;
+			isect->v = v;
+			isect->t = t;
+			return true;
 		}
-#endif
-		/* Normalize U, V, W, and T. */
-		const float inv_det = 1.0f / det;
-		isect->prim = triAddr;
-		isect->object = object;
-		isect->type = PRIMITIVE_TRIANGLE;
-		isect->u = U * inv_det;
-		isect->v = V * inv_det;
-		isect->t = T * inv_det;
-		return true;
 	}
 	return false;
 }
@@ -260,138 +78,37 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
 #ifdef __SUBSURFACE__
 ccl_device_inline void triangle_intersect_subsurface(
         KernelGlobals *kg,
-        const IsectPrecalc *isect_precalc,
         SubsurfaceIntersection *ss_isect,
         float3 P,
+        float3 dir,
         int object,
-        int triAddr,
+        int prim_addr,
         float tmax,
         uint *lcg_state,
         int max_hits)
 {
-	const int kx = isect_precalc->kx;
-	const int ky = isect_precalc->ky;
-	const int kz = isect_precalc->kz;
-	const float Sx = isect_precalc->Sx;
-	const float Sy = isect_precalc->Sy;
-	const float Sz = isect_precalc->Sz;
-
-	/* Calculate vertices relative to ray origin. */
-	const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, triAddr);
-	const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0),
-	             tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1),
-	             tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2);
-
-#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
-	const avxf avxf_P(P.m128, P.m128);
-
-	const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0);
-	const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1);
-
-	const avxf AB = tri_ab - avxf_P;
-	const avxf BC = tri_bc - avxf_P;
-
-	const __m256i permuteMask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx);
-
-	const avxf AB_k = shuffle(AB, permuteMask);
-	const avxf BC_k = shuffle(BC, permuteMask);
-
-	/* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */
-	const avxf ABBC_kz = shuffle<2>(AB_k, BC_k);
-
-	/* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */
-	const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k);
-
-	const avxf Sxy(Sy, Sx, Sy, Sx);
-
-	/* Ax, Ay, Bx, By, Bx, By, Cx, Cy */
-	const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy);
-
-	float ABBC_kz_array[8];
-	_mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz);
-
-	const float A_kz = ABBC_kz_array[0];
-	const float B_kz = ABBC_kz_array[2];
-	const float C_kz = ABBC_kz_array[6];
-
-	/* By, Bx, Cy, Cx, By, Bx, Ay, Ax */
-	const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy);
-
-	const avxf negMask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000);
-
-	/* W           U                             V
-	 * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX
-	 */
-	const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, negMask /* Dont care */);
-
-	const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ negMask;
-
-	/* Calculate scaled barycentric coordinates. */
-	float WUVW_array[4];
-	_mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW));
-
-	const float W = WUVW_array[0];
-	const float U = WUVW_array[1];
-	const float V = WUVW_array[2];
-
-	const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW);
-	const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW,
-	                                               _mm256_setzero_ps(), 0));
-
-	if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) {
-		return;
-	}
+	const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr);
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	const ssef *ssef_verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex];
 #else
-	const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z);
-	const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z);
-	const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z);
-
-	const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz);
-	const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz);
-	const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz);
-
-	/* Perform shear and scale of vertices. */
-	const float Ax = A_kx - Sx * A_kz;
-	const float Ay = A_ky - Sy * A_kz;
-	const float Bx = B_kx - Sx * B_kz;
-	const float By = B_ky - Sy * B_kz;
-	const float Cx = C_kx - Sx * C_kz;
-	const float Cy = C_ky - Sy * C_kz;
-
-	/* Calculate scaled barycentric coordinates. */
-	float U = Cx * By - Cy * Bx;
-	float V = Ax * Cy - Ay * Cx;
-	float W = Bx * Ay - By * Ax;
-
-	if((U < 0.0f || V < 0.0f || W < 0.0f) &&
-	   (U > 0.0f || V > 0.0f || W > 0.0f))
-	{
-		return;
-	}
+	const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+0)),
+	             tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+1)),
+	             tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+2));
 #endif
-
-	/* Calculate determinant. */
-	float det = U + V + W;
-	if(UNLIKELY(det == 0.0f)) {
-		return;
-	}
-
-	/* Calculate scaled z−coordinates of vertices and use them to calculate
-	 * the hit distance.
-	 */
-	const int sign_det = (__float_as_int(det) & 0x80000000);
-	const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz;
-	const float sign_T = xor_signmask(T, sign_det);
-	if((sign_T < 0.0f) ||
-	   (sign_T > tmax * xor_signmask(det, sign_det)))
+	float t, u, v;
+	if(!ray_triangle_intersect(P,
+	                           dir,
+	                           tmax,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	                           ssef_verts,
+#else
+	                           tri_a, tri_b, tri_c,
+#endif
+	                           &u, &v, &t))
 	{
 		return;
 	}
 
-	/* Normalize U, V, W, and T. */
-	const float inv_det = 1.0f / det;
-
-	const float t = T * inv_det;
 	for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) {
 		if(ss_isect->hits[i].t == t) {
 			return;
@@ -415,21 +132,22 @@ ccl_device_inline void triangle_intersect_subsurface(
 
 	/* record intersection */
 	Intersection *isect = &ss_isect->hits[hit];
-	isect->prim = triAddr;
+	isect->prim = prim_addr;
 	isect->object = object;
 	isect->type = PRIMITIVE_TRIANGLE;
-	isect->u = U * inv_det;
-	isect->v = V * inv_det;
+	isect->u = u;
+	isect->v = v;
 	isect->t = t;
 
 	/* Record geometric normal. */
-	/* TODO(sergey): Use float4_to_float3() on just an edges. */
-	const float3 v0 = float4_to_float3(tri_a);
-	const float3 v1 = float4_to_float3(tri_b);
-	const float3 v2 = float4_to_float3(tri_c);
-	ss_isect->Ng[hit] = normalize(cross(v1 - v0, v2 - v0));
-}
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+0)),
+	             tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+1)),
+	             tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+2));
 #endif
+	ss_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
+}
+#endif  /* __SUBSURFACE__ */
 
 /* Refine triangle intersection to more precise hit point. For rays that travel
  * far the precision is often not so good, this reintersects the primitive from
@@ -457,7 +175,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
 			return P;
 		}
 #  ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
+		Transform tfm = sd->ob_itfm;
 #  else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
 #  endif
@@ -491,7 +209,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #  ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
+		Transform tfm = sd->ob_tfm;
 #  else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
 #  endif
@@ -519,7 +237,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
+		Transform tfm = sd->ob_itfm;
 #else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -557,7 +275,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
+		Transform tfm = sd->ob_tfm;
 #else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -570,6 +288,4 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
 	return P;
 }
 
-#undef IDX
-
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 28ea80f1a65..9a5b94c1f46 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -33,21 +33,6 @@ CCL_NAMESPACE_BEGIN
 
 /* Return position normalized to 0..1 in mesh bounds */
 
-#if defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300
-ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z)
-{
-	float4 r;
-	switch(id) {
-		case 0: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_000, x, y, z); break;
-		case 1: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_001, x, y, z); break;
-		case 2: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_002, x, y, z); break;
-		case 3: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_003, x, y, z); break;
-		case 4: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_004, x, y, z); break;
-	}
-	return r;
-}
-#endif  /* __KERNEL_CUDA__ */
-
 ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg,
                                                     const ShaderData *sd,
                                                     float3 P)
@@ -68,39 +53,14 @@ ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg,
 
 ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
 {
-	float r;
-	
-#ifdef __KERNEL_CUDA__
-#  if __CUDA_ARCH__ >= 300
-	float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P));
-	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset);
-	r = kernel_tex_image_interp_3d_float(tex, P.x, P.y, P.z);
-#  else
-	float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P));
-	r = average(float4_to_float3(volume_image_texture_3d(desc.offset, P.x, P.y, P.z)));
-#  endif
-#elif defined(__KERNEL_OPENCL__)
-	float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P));
-	r = average(float4_to_float3(kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z)));
-#else
-
-#if 1 /* XXX WITH_OPENVDB ? */
-	float3 P = ccl_fetch(sd, P);
-	/* XXX OpenVDB does not support cubic interpolation (could use quadratic though) - lukas_t */
-#if 0
-	if(sd->flag & SD_VOLUME_CUBIC)
-		r = kernel_tex_voxel_float(desc.offset, P.x, P.y, P.z, ...)
-	else
-#endif
-		r = kernel_tex_voxel_float(desc.offset, P.x, P.y, P.z, OPENVDB_SAMPLE_BOX);
+#ifdef __OPENVDB__
+	float3 P = sd->P;
+	/* XXX OpenVDB does not support cubic interpolation - lukas_t */
+	float r = kernel_tex_voxel_float(desc.offset, P.x, P.y, P.z, OPENVDB_SAMPLE_BOX);
 #else
-	float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P));
-	if(sd->flag & SD_VOLUME_CUBIC)
-		r = average(float4_to_float3(kernel_tex_image_interp_3d_ex(desc.offset, P.x, P.y, P.z, INTERPOLATION_CUBIC)));
-	else
-		r = average(float4_to_float3(kernel_tex_image_interp_3d(desc.offset, P.x, P.y, P.z)));
-#endif
-
+	float3 P = volume_normalized_position(kg, sd, sd->P);
+	InterpolationType interp = (sd->flag & SD_VOLUME_CUBIC)? INTERPOLATION_CUBIC: INTERPOLATION_NONE;
+	float r = average(float4_to_float3(kernel_tex_image_interp_3d_float(kg, desc.offset, P.x, P.y, P.z, interp)));
 #endif
 
 	if(dx) *dx = 0.0f;
@@ -111,33 +71,14 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 
 ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy)
 {
-	float3 r;
-	
-#ifdef __KERNEL_CUDA__
-#  if __CUDA_ARCH__ >= 300
-	float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P));
-	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset);
-	r = float4_to_float3(kernel_tex_image_interp_3d_float4(tex, P.x, P.y, P.z));
-#  else
-	float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P));
-	r = float4_to_float3(volume_image_texture_3d(desc.offset, P.x, P.y, P.z));
-#  endif
-#elif defined(__KERNEL_OPENCL__)
-	float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P));
-	r = float4_to_float3(kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z));
+#ifdef __OPENVDB__
+	float3 P = sd->P;
+	/* XXX OpenVDB does not support cubic interpolation - lukas_t */
+	float3 r = kernel_tex_voxel_float3(desc.offset, P.x, P.y, P.z, OPENVDB_SAMPLE_BOX);
 #else
-
-#if 1 /* XXX WITH_OPENVDB ? */
-	float3 P = ccl_fetch(sd, P);
-	r = kernel_tex_voxel_float3(desc.offset, P.x, P.y, P.z, OPENVDB_SAMPLE_POINT);
-#else
-	float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P));
-	if(sd->flag & SD_VOLUME_CUBIC)
-		r = float4_to_float3(kernel_tex_image_interp_3d_ex(desc.offset, P.x, P.y, P.z, INTERPOLATION_CUBIC));
-	else
-		r = float4_to_float3(kernel_tex_image_interp_3d(desc.offset, P.x, P.y, P.z));
-#endif
-
+	float3 P = volume_normalized_position(kg, sd, sd->P);
+	InterpolationType interp = (sd->flag & SD_VOLUME_CUBIC)? INTERPOLATION_CUBIC: INTERPOLATION_NONE;
+	float3 r = float4_to_float3(kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z, interp));
 #endif
 
 	if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 9279a94c13a..84a988f1dbc 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -19,7 +19,8 @@
 
 /* CPU Kernel Interface */
 
-#include "util_types.h"
+#include "util/util_types.h"
+#include "kernel/kernel_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -28,6 +29,7 @@ CCL_NAMESPACE_BEGIN
 #define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
 
 struct KernelGlobals;
+struct KernelData;
 
 KernelGlobals *kernel_globals_create();
 void kernel_globals_free(KernelGlobals *kg);
@@ -46,32 +48,22 @@ void kernel_tex_copy(KernelGlobals *kg,
                      ExtensionType extension = EXTENSION_REPEAT);
 
 #define KERNEL_ARCH cpu
-#include "kernels/cpu/kernel_cpu.h"
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-#  define KERNEL_ARCH cpu_sse2
-#  include "kernels/cpu/kernel_cpu.h"
-#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-#  define KERNEL_ARCH cpu_sse3
-#  include "kernels/cpu/kernel_cpu.h"
-#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-#  define KERNEL_ARCH cpu_sse41
-#  include "kernels/cpu/kernel_cpu.h"
-#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-#  define KERNEL_ARCH cpu_avx
-#  include "kernels/cpu/kernel_cpu.h"
-#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-#  define KERNEL_ARCH cpu_avx2
-#  include "kernels/cpu/kernel_cpu.h"
-#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu.h"
+
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu.h"
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 6c3ee6b8098..366f25422fd 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -21,6 +21,9 @@ CCL_NAMESPACE_BEGIN
  * BSDF evaluation result, split per BSDF type. This is used to accumulate
  * render passes separately. */
 
+ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg,
+                                           const ShaderData *sd);
+
 ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 value, int use_light_pass)
 {
 #ifdef __PASSES__
@@ -52,10 +55,17 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v
 	{
 		eval->diffuse = value;
 	}
+#ifdef __SHADOW_TRICKS__
+	eval->sum_no_mis = make_float3(0.0f, 0.0f, 0.0f);
+#endif
 }
 
-ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
+ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value, float mis_weight)
 {
+#ifdef __SHADOW_TRICKS__
+	eval->sum_no_mis += value;
+#endif
+	value *= mis_weight;
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
 		if(CLOSURE_IS_BSDF_DIFFUSE(type))
@@ -96,7 +106,7 @@ ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
 	}
 }
 
-ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value)
+ccl_device_inline void bsdf_eval_mis(BsdfEval *eval, float value)
 {
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
@@ -115,8 +125,19 @@ ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value)
 	}
 }
 
+ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value)
+{
+#ifdef __SHADOW_TRICKS__
+	eval->sum_no_mis *= value;  /* sum_no_mis is accumulated before the MIS weight in bsdf_eval_accum() */
+#endif
+	bsdf_eval_mis(eval, value);  /* delegate the rest to bsdf_eval_mis() */
+}
+
 ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value)
 {
+#ifdef __SHADOW_TRICKS__
+	eval->sum_no_mis *= value;
+#endif
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
 		eval->diffuse *= value;
@@ -134,7 +155,7 @@ ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value)
 #endif
 }
 
-ccl_device_inline float3 bsdf_eval_sum(BsdfEval *eval)
+ccl_device_inline float3 bsdf_eval_sum(const BsdfEval *eval)
 {
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
@@ -160,7 +181,6 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
 
 	if(use_light_pass) {
 		L->indirect = make_float3(0.0f, 0.0f, 0.0f);
-		L->direct_throughput = make_float3(0.0f, 0.0f, 0.0f);
 		L->direct_emission = make_float3(0.0f, 0.0f, 0.0f);
 
 		L->color_diffuse = make_float3(0.0f, 0.0f, 0.0f);
@@ -181,45 +201,78 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
 		L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f);
 		L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f);
 
-		L->path_diffuse = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_glossy = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_transmission = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_scatter = make_float3(0.0f, 0.0f, 0.0f);
-
+		L->transparent = 0.0f;
 		L->emission = make_float3(0.0f, 0.0f, 0.0f);
 		L->background = make_float3(0.0f, 0.0f, 0.0f);
 		L->ao = make_float3(0.0f, 0.0f, 0.0f);
 		L->shadow = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 		L->mist = 0.0f;
+
+		L->state.diffuse = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.glossy = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.transmission = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.subsurface = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.scatter = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.direct = make_float3(0.0f, 0.0f, 0.0f);
 	}
 	else
 #endif
 	{
+		L->transparent = 0.0f;
 		L->emission = make_float3(0.0f, 0.0f, 0.0f);
 	}
+
+#ifdef __SHADOW_TRICKS__
+	L->path_total = make_float3(0.0f, 0.0f, 0.0f);
+	L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f);
+	L->shadow_background_color = make_float3(0.0f, 0.0f, 0.0f);
+	L->shadow_throughput = 0.0f;
+	L->shadow_transparency = 1.0f;
+	L->has_shadow_catcher = 0;
+#endif
+
+#ifdef __DENOISING_FEATURES__
+	L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f);
+	L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f);
+	L->denoising_depth = 0.0f;
+#endif
+
+#ifdef __KERNEL_DEBUG__
+	L->debug_data.num_bvh_traversed_nodes = 0;
+	L->debug_data.num_bvh_traversed_instances = 0;
+	L->debug_data.num_bvh_intersections = 0;
+	L->debug_data.num_ray_bounces = 0;
+#endif
 }
 
-ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
-	BsdfEval *bsdf_eval, float bsdf_pdf, int bounce, int bsdf_label)
+ccl_device_inline void path_radiance_bsdf_bounce(
+	KernelGlobals *kg,
+	PathRadianceState *L_state,
+	ccl_addr_space float3 *throughput,
+	BsdfEval *bsdf_eval,
+	float bsdf_pdf, int bounce, int bsdf_label)
 {
 	float inverse_pdf = 1.0f/bsdf_pdf;
 
 #ifdef __PASSES__
-	if(L->use_light_pass) {
+	if(kernel_data.film.use_light_pass) {
 		if(bounce == 0 && !(bsdf_label & LABEL_TRANSPARENT)) {
 			/* first on directly visible surface */
 			float3 value = *throughput*inverse_pdf;
 
-			L->path_diffuse = bsdf_eval->diffuse*value;
-			L->path_glossy = bsdf_eval->glossy*value;
-			L->path_transmission = bsdf_eval->transmission*value;
-			L->path_subsurface = bsdf_eval->subsurface*value;
-			L->path_scatter = bsdf_eval->scatter*value;
-
-			*throughput = L->path_diffuse + L->path_glossy + L->path_transmission + L->path_subsurface + L->path_scatter;
+			L_state->diffuse = bsdf_eval->diffuse*value;
+			L_state->glossy = bsdf_eval->glossy*value;
+			L_state->transmission = bsdf_eval->transmission*value;
+			L_state->subsurface = bsdf_eval->subsurface*value;
+			L_state->scatter = bsdf_eval->scatter*value;
+
+			*throughput = L_state->diffuse +
+			              L_state->glossy +
+			              L_state->transmission +
+			              L_state->subsurface +
+			              L_state->scatter;
 			
-			L->direct_throughput = *throughput;
+			L_state->direct = *throughput;
 		}
 		else {
 			/* transparent bounce before first hit, or indirectly visible through BSDF */
@@ -234,13 +287,22 @@ ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space
 	}
 }
 
-ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 throughput, float3 value, int bounce)
+ccl_device_inline void path_radiance_accum_emission(PathRadiance *L,
+                                                    ccl_addr_space PathState *state,
+                                                    float3 throughput,
+                                                    float3 value)
 {
+#ifdef __SHADOW_TRICKS__
+	if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+		return;
+	}
+#endif
+
 #ifdef __PASSES__
 	if(L->use_light_pass) {
-		if(bounce == 0)
+		if(state->bounce == 0)
 			L->emission += throughput*value;
-		else if(bounce == 1)
+		else if(state->bounce == 1)
 			L->direct_emission += throughput*value;
 		else
 			L->indirect += throughput*value;
@@ -252,11 +314,28 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro
 	}
 }
 
-ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, float3 throughput, float3 alpha, float3 bsdf, float3 ao, int bounce)
+ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
+                                              ccl_addr_space PathState *state,
+                                              float3 throughput,
+                                              float3 alpha,
+                                              float3 bsdf,
+                                              float3 ao)
 {
+#ifdef __SHADOW_TRICKS__
+	if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+		float3 light = throughput * bsdf;
+		L->path_total += light;
+		L->path_total_shaded += ao * light;
+
+		if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+			return;
+		}
+	}
+#endif
+
 #ifdef __PASSES__
 	if(L->use_light_pass) {
-		if(bounce == 0) {
+		if(state->bounce == 0) {
 			/* directly visible lighting */
 			L->direct_diffuse += throughput*bsdf*ao;
 			L->ao += alpha*throughput*ao;
@@ -273,11 +352,47 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, float3 throughput
 	}
 }
 
-ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp)
+/* Add throughput*bsdf to L->path_total for paths that store shadow info (no-op without __SHADOW_TRICKS__). */
+ccl_device_inline void path_radiance_accum_total_ao(PathRadiance *L,
+                                                    ccl_addr_space PathState *state,
+                                                    float3 throughput,
+                                                    float3 bsdf)
+{
+#ifndef __SHADOW_TRICKS__
+	(void) L;
+	(void) state;
+	(void) throughput;
+	(void) bsdf;
+#else
+	if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+		L->path_total += throughput * bsdf;
+	}
+#endif
+}
+
+ccl_device_inline void path_radiance_accum_light(PathRadiance *L,
+                                                 ccl_addr_space PathState *state,
+                                                 float3 throughput,
+                                                 BsdfEval *bsdf_eval,
+                                                 float3 shadow,
+                                                 float shadow_fac,
+                                                 bool is_lamp)
 {
+#ifdef __SHADOW_TRICKS__
+	if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+		float3 light = throughput * bsdf_eval->sum_no_mis;
+		L->path_total += light;
+		L->path_total_shaded += shadow * light;
+
+		if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+			return;
+		}
+	}
+#endif
+
 #ifdef __PASSES__
 	if(L->use_light_pass) {
-		if(bounce == 0) {
+		if(state->bounce == 0) {
 			/* directly visible lighting */
 			L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow;
 			L->direct_glossy += throughput*bsdf_eval->glossy*shadow;
@@ -303,13 +418,47 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
 	}
 }
 
-ccl_device_inline void path_radiance_accum_background(PathRadiance *L, float3 throughput, float3 value, int bounce)
+/* Add throughput*sum_no_mis to L->path_total for paths that store shadow info (no-op without __SHADOW_TRICKS__). */
+ccl_device_inline void path_radiance_accum_total_light(PathRadiance *L,
+                                                       ccl_addr_space PathState *state,
+                                                       float3 throughput,
+                                                       const BsdfEval *bsdf_eval)
 {
+#ifndef __SHADOW_TRICKS__
+	(void) L;
+	(void) state;
+	(void) throughput;
+	(void) bsdf_eval;
+#else
+	if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+		L->path_total += throughput * bsdf_eval->sum_no_mis;
+	}
+#endif
+}
+
+ccl_device_inline void path_radiance_accum_background(
+        PathRadiance *L,
+        ccl_addr_space PathState *state,
+        float3 throughput,
+        float3 value)
+{
+
+#ifdef __SHADOW_TRICKS__
+	if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+		L->path_total += throughput * value;
+		L->path_total_shaded += throughput * value * L->shadow_transparency;
+
+		if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+			return;
+		}
+	}
+#endif
+
 #ifdef __PASSES__
 	if(L->use_light_pass) {
-		if(bounce == 0)
+		if(state->bounce == 0)
 			L->background += throughput*value;
-		else if(bounce == 1)
+		else if(state->bounce == 1)
 			L->direct_emission += throughput*value;
 		else
 			L->indirect += throughput*value;
@@ -319,7 +468,31 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L, float3 th
 	{
 		L->emission += throughput*value;
 	}
+
+#ifdef __DENOISING_FEATURES__
+	L->denoising_albedo += state->denoising_feature_weight * value;
+#endif  /* __DENOISING_FEATURES__ */
+}
+
+/* Accumulate the channel average of throughput into L->transparent; state is not read here. */
+ccl_device_inline void path_radiance_accum_transparent(PathRadiance *L,
+                                                       ccl_addr_space PathState *state,
+                                                       float3 throughput)
+{
+	L->transparent += average(throughput);
+}
+
+#ifdef __SHADOW_TRICKS__
+/* Flag the path as having a shadow catcher; accumulate its throughput and throughput-weighted background. */
+ccl_device_inline void path_radiance_accum_shadowcatcher(PathRadiance *L,
+                                                         float3 throughput,
+                                                         float3 background)
+{
+	L->has_shadow_catcher = 1;
+	L->shadow_background_color += throughput * background;
+	L->shadow_throughput += average(throughput);
 }
+#endif
 
 ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
 {
@@ -328,19 +501,19 @@ ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
 	 * only a single throughput further along the path, here we recover just
 	 * the indirect path that is not influenced by any particular BSDF type */
 	if(L->use_light_pass) {
-		L->direct_emission = safe_divide_color(L->direct_emission, L->direct_throughput);
-		L->direct_diffuse += L->path_diffuse*L->direct_emission;
-		L->direct_glossy += L->path_glossy*L->direct_emission;
-		L->direct_transmission += L->path_transmission*L->direct_emission;
-		L->direct_subsurface += L->path_subsurface*L->direct_emission;
-		L->direct_scatter += L->path_scatter*L->direct_emission;
-
-		L->indirect = safe_divide_color(L->indirect, L->direct_throughput);
-		L->indirect_diffuse += L->path_diffuse*L->indirect;
-		L->indirect_glossy += L->path_glossy*L->indirect;
-		L->indirect_transmission += L->path_transmission*L->indirect;
-		L->indirect_subsurface += L->path_subsurface*L->indirect;
-		L->indirect_scatter += L->path_scatter*L->indirect;
+		L->direct_emission = safe_divide_color(L->direct_emission, L->state.direct);
+		L->direct_diffuse += L->state.diffuse*L->direct_emission;
+		L->direct_glossy += L->state.glossy*L->direct_emission;
+		L->direct_transmission += L->state.transmission*L->direct_emission;
+		L->direct_subsurface += L->state.subsurface*L->direct_emission;
+		L->direct_scatter += L->state.scatter*L->direct_emission;
+
+		L->indirect = safe_divide_color(L->indirect, L->state.direct);
+		L->indirect_diffuse += L->state.diffuse*L->indirect;
+		L->indirect_glossy += L->state.glossy*L->indirect;
+		L->indirect_transmission += L->state.transmission*L->indirect;
+		L->indirect_subsurface += L->state.subsurface*L->indirect;
+		L->indirect_scatter += L->state.scatter*L->indirect;
 	}
 #endif
 }
@@ -349,11 +522,11 @@ ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L)
 {
 #ifdef __PASSES__
 	if(L->use_light_pass) {
-		L->path_diffuse = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_glossy = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_transmission = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_scatter = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.diffuse = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.glossy = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.transmission = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.subsurface = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 		L->direct_emission = make_float3(0.0f, 0.0f, 0.0f);
 		L->indirect = make_float3(0.0f, 0.0f, 0.0f);
@@ -366,11 +539,7 @@ ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L,
 {
 #ifdef __PASSES__
 	if(L->use_light_pass) {
-		L->path_diffuse = L_src->path_diffuse;
-		L->path_glossy = L_src->path_glossy;
-		L->path_transmission = L_src->path_transmission;
-		L->path_subsurface = L_src->path_subsurface;
-		L->path_scatter = L_src->path_scatter;
+		L->state = L_src->state;
 
 		L->direct_emission = L_src->direct_emission;
 		L->indirect = L_src->indirect;
@@ -378,7 +547,40 @@ ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L,
 #endif
 }
 
-ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadiance *L)
+#ifdef __SHADOW_TRICKS__
+ccl_device_inline void path_radiance_sum_shadowcatcher(KernelGlobals *kg,
+                                                       PathRadiance *L,
+                                                       float3 *L_sum,
+                                                       float *alpha)
+{
+	/* Shadow factor: channel-averaged path_total_shaded divided by path_total. */
+	const float total = average(L->path_total);
+	float shadow_fac = 0.0f;
+
+	if(UNLIKELY(!isfinite_safe(total))) {
+		/* Invalid total: the factor stays at zero. */
+		kernel_assert(!"Non-finite total radiance along the path");
+	}
+	else if(total != 0.0f) {
+		shadow_fac = average(L->path_total_shaded) / total;
+	}
+	else {
+		/* Zero total: fall back to the stored shadow transparency. */
+		shadow_fac = L->shadow_transparency;
+	}
+
+	/* Apply the factor either to alpha or to the accumulated background color. */
+	if(!kernel_data.background.transparent) {
+		L->shadow_background_color *= shadow_fac;
+		*L_sum += L->shadow_background_color;
+	}
+	else {
+		*alpha -= L->shadow_throughput * shadow_fac;
+	}
+}
+#endif
+
+ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadiance *L, float *alpha)
 {
 	float3 L_sum;
 	/* Light Passes are used */
@@ -399,7 +601,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 		float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
 
 		/* Reject invalid value */
-		if(!isfinite(sum)) {
+		if(!isfinite_safe(sum)) {
 			kernel_assert(!"Non-finite sum in path_radiance_clamp_and_sum!");
 			L_sum = make_float3(0.0f, 0.0f, 0.0f);
 
@@ -455,8 +657,6 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 			L_sum = L_direct + L_indirect;
 		}
 #endif
-
-		return L_sum;
 	}
 
 	/* No Light Passes */
@@ -464,42 +664,105 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 #endif
 	{
 		L_sum = L->emission;
+
+		/* Reject invalid value */
+		float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
+		if(!isfinite_safe(sum)) {
+			kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!");
+			L_sum = make_float3(0.0f, 0.0f, 0.0f);
+		}
 	}
 
-	/* Reject invalid value */
-	float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
-	if(!isfinite(sum)) {
-		kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!");
-		L_sum = make_float3(0.0f, 0.0f, 0.0f);
+	/* Compute alpha. */
+	*alpha = 1.0f - L->transparent;
+
+	/* Add shadow catcher contributions. */
+#ifdef __SHADOW_TRICKS__
+	if(L->has_shadow_catcher) {
+		path_radiance_sum_shadowcatcher(kg, L, &L_sum, alpha);
 	}
+#endif  /* __SHADOW_TRICKS__ */
 
 	return L_sum;
 }
 
-ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples)
+ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadiance *L, float3 *noisy, float3 *clean)
+{
+#ifdef __PASSES__
+	kernel_assert(L->use_light_pass);
+	/* Split L into *clean and *noisy; these passes have a fixed destination. */
+	*clean = L->emission + L->background;
+	*noisy = L->direct_scatter + L->indirect_scatter;
+	/* Every other pass goes to *clean if its bit is set in film.denoising_flags, else to *noisy. */
+#  define ADD_COMPONENT(flag, component) \
+	do { \
+		if(kernel_data.film.denoising_flags & (flag)) *clean += (component); \
+		else *noisy += (component); \
+	} while(0)
+
+	ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR,      L->direct_diffuse);
+	ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND,      L->indirect_diffuse);
+	ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR,       L->direct_glossy);
+	ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND,       L->indirect_glossy);
+	ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission);
+	ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission);
+	ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR,   L->direct_subsurface);
+	ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND,   L->indirect_subsurface);
+#  undef ADD_COMPONENT
+#else
+	*noisy = L->emission;
+	*clean = make_float3(0.0f, 0.0f, 0.0f);
+#endif
+
+#ifdef __SHADOW_TRICKS__
+	if(L->has_shadow_catcher) {
+		*noisy += L->shadow_background_color;
+	}
+#endif
+
+	*noisy = ensure_finite3(*noisy);
+	*clean = ensure_finite3(*clean);
+}
+
+ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample)
 {
-	float fac = 1.0f/num_samples;
+#ifdef __SPLIT_KERNEL__
+#  define safe_float3_add(f, v) \
+	do { \
+		ccl_global float *p = (ccl_global float*)(&(f)); \
+		atomic_add_and_fetch_float(p+0, (v).x); \
+		atomic_add_and_fetch_float(p+1, (v).y); \
+		atomic_add_and_fetch_float(p+2, (v).z); \
+	} while(0)
+#  define safe_float_add(f, v) \
+		atomic_add_and_fetch_float(&(f), (v))
+#else
+#  define safe_float3_add(f, v) (f) += (v)
+#  define safe_float_add(f, v) (f) += (v)
+#endif  /* __SPLIT_KERNEL__ */
 
 #ifdef __PASSES__
-	L->direct_diffuse += L_sample->direct_diffuse*fac;
-	L->direct_glossy += L_sample->direct_glossy*fac;
-	L->direct_transmission += L_sample->direct_transmission*fac;
-	L->direct_subsurface += L_sample->direct_subsurface*fac;
-	L->direct_scatter += L_sample->direct_scatter*fac;
-
-	L->indirect_diffuse += L_sample->indirect_diffuse*fac;
-	L->indirect_glossy += L_sample->indirect_glossy*fac;
-	L->indirect_transmission += L_sample->indirect_transmission*fac;
-	L->indirect_subsurface += L_sample->indirect_subsurface*fac;
-	L->indirect_scatter += L_sample->indirect_scatter*fac;
-
-	L->background += L_sample->background*fac;
-	L->ao += L_sample->ao*fac;
-	L->shadow += L_sample->shadow*fac;
-	L->mist += L_sample->mist*fac;
-#endif
-	L->emission += L_sample->emission * fac;
+	safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse);
+	safe_float3_add(L->direct_glossy, L_sample->direct_glossy);
+	safe_float3_add(L->direct_transmission, L_sample->direct_transmission);
+	safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface);
+	safe_float3_add(L->direct_scatter, L_sample->direct_scatter);
+
+	safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse);
+	safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy);
+	safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission);
+	safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface);
+	safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter);
+
+	safe_float3_add(L->background, L_sample->background);
+	safe_float3_add(L->ao, L_sample->ao);
+	safe_float3_add(L->shadow, L_sample->shadow);  /* NOTE(review): L->shadow is set with make_float4() in path_radiance_init(); the split-kernel macro adds only x/y/z - confirm w is not needed */
+	safe_float_add(L->mist, L_sample->mist);
+#endif  /* __PASSES__ */
+	safe_float3_add(L->emission, L_sample->emission);
+
+#undef safe_float_add
+#undef safe_float3_add
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index c32ac6ccf41..84d8d84d486 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void compute_light_pass(KernelGlobals *kg,
                                           ShaderData *sd,
                                           PathRadiance *L,
-                                          RNG rng,
+                                          uint rng_hash,
                                           int pass_filter,
                                           int sample)
 {
@@ -48,13 +48,13 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 	path_radiance_init(&L_sample, kernel_data.film.use_light_pass);
 
 	/* init path state */
-	path_state_init(kg, &emission_sd, &state, &rng, sample, NULL);
+	path_state_init(kg, &emission_sd, &state, rng_hash, sample, NULL);
 
 	/* evaluate surface shader */
-	float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF);
-	shader_eval_surface(kg, sd, &rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
+	shader_eval_surface(kg, sd, &state, state.flag);
 
-	/* TODO, disable the closures we won't need */
+	/* TODO, disable more closures we don't need besides transparent */
+	shader_bsdf_disable_transparency(kg, sd);
 
 #ifdef __BRANCHED_PATH__
 	if(!kernel_data.integrator.branched) {
@@ -63,13 +63,13 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 
 		/* sample ambient occlusion */
 		if(pass_filter & BAKE_FILTER_AO) {
-			kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput, shader_bsdf_alpha(kg, sd));
+			kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, throughput, shader_bsdf_alpha(kg, sd));
 		}
 
 		/* sample emission */
 		if((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) {
 			float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
-			path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
+			path_radiance_accum_emission(&L_sample, &state, throughput, emission);
 		}
 
 		bool is_sss_sample = false;
@@ -85,7 +85,6 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 			                                  &emission_sd,
 			                                  &L_sample,
 			                                  &state,
-			                                  &rng,
 			                                  &ray,
 			                                  &throughput,
 			                                  &ss_indirect))
@@ -100,13 +99,10 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 					kernel_path_indirect(kg,
 					                     &indirect_sd,
 					                     &emission_sd,
-					                     &rng,
 					                     &ray,
 					                     throughput,
-					                     state.num_samples,
 					                     &state,
 					                     &L_sample);
-					kernel_path_subsurface_accum_indirect(&ss_indirect, &L_sample);
 				}
 				is_sss_sample = true;
 			}
@@ -115,14 +111,14 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 
 		/* sample light and BSDF */
 		if(!is_sss_sample && (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT))) {
-			kernel_path_surface_connect_light(kg, &rng, sd, &emission_sd, throughput, &state, &L_sample);
+			kernel_path_surface_connect_light(kg, sd, &emission_sd, throughput, &state, &L_sample);
 
-			if(kernel_path_surface_bounce(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) {
+			if(kernel_path_surface_bounce(kg, sd, &throughput, &state, &L_sample.state, &ray)) {
 #ifdef __LAMP_MIS__
 				state.ray_t = 0.0f;
 #endif
 				/* compute indirect light */
-				kernel_path_indirect(kg, &indirect_sd, &emission_sd, &rng, &ray, throughput, 1, &state, &L_sample);
+				kernel_path_indirect(kg, &indirect_sd, &emission_sd, &ray, throughput, &state, &L_sample);
 
 				/* sum and reset indirect light pass variables for the next samples */
 				path_radiance_sum_indirect(&L_sample);
@@ -136,13 +132,13 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 
 		/* sample ambient occlusion */
 		if(pass_filter & BAKE_FILTER_AO) {
-			kernel_branched_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput);
+			kernel_branched_path_ao(kg, sd, &emission_sd, &L_sample, &state, throughput);
 		}
 
 		/* sample emission */
 		if((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) {
 			float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
-			path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
+			path_radiance_accum_emission(&L_sample, &state, throughput, emission);
 		}
 
 #ifdef __SUBSURFACE__
@@ -150,7 +146,7 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 		if((pass_filter & BAKE_FILTER_SUBSURFACE) && (sd->flag & SD_BSSRDF)) {
 			/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
 			kernel_branched_path_subsurface_scatter(kg, sd, &indirect_sd,
-				&emission_sd, &L_sample, &state, &rng, &ray, throughput);
+				&emission_sd, &L_sample, &state, &ray, throughput);
 		}
 #endif
 
@@ -160,20 +156,20 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 			/* direct light */
 			if(kernel_data.integrator.use_direct_light) {
 				int all = kernel_data.integrator.sample_all_lights_direct;
-				kernel_branched_path_surface_connect_light(kg, &rng,
+				kernel_branched_path_surface_connect_light(kg,
 					sd, &emission_sd, &state, throughput, 1.0f, &L_sample, all);
 			}
 #endif
 
 			/* indirect light */
-			kernel_branched_path_surface_indirect_light(kg, &rng,
+			kernel_branched_path_surface_indirect_light(kg,
 				sd, &indirect_sd, &emission_sd, throughput, 1.0f, &state, &L_sample);
 		}
 	}
 #endif
 
 	/* accumulate into master L */
-	path_radiance_accum_sample(L, &L_sample, 1);
+	path_radiance_accum_sample(L, &L_sample);
 }
 
 ccl_device bool is_aa_pass(ShaderEvalType type)
@@ -224,7 +220,6 @@ ccl_device_inline float3 kernel_bake_shader_bsdf(KernelGlobals *kg,
 
 ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg,
                                                        ShaderData *sd,
-                                                       RNG *rng,
                                                        PathState *state,
                                                        float3 direct,
                                                        float3 indirect,
@@ -244,12 +239,12 @@ ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg,
 		}
 		else {
 			/* surface color of the pass only */
-			shader_eval_surface(kg, sd, rng, state, 0.0f, 0, SHADER_CONTEXT_MAIN);
+			shader_eval_surface(kg, sd, state, 0);
 			return kernel_bake_shader_bsdf(kg, sd, type);
 		}
 	}
 	else {
-		shader_eval_surface(kg, sd, rng, state, 0.0f, 0, SHADER_CONTEXT_MAIN);
+		shader_eval_surface(kg, sd, state, 0);
 		color = kernel_bake_shader_bsdf(kg, sd, type);
 	}
 
@@ -291,14 +286,14 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	int num_samples = kernel_data.integrator.aa_samples;
 
 	/* random number generator */
-	RNG rng = cmj_hash(offset + i, kernel_data.integrator.seed);
+	uint rng_hash = cmj_hash(offset + i, kernel_data.integrator.seed);
 
 	float filter_x, filter_y;
 	if(sample == 0) {
 		filter_x = filter_y = 0.5f;
 	}
 	else {
-		path_rng_2D(kg, &rng, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y);
+		path_rng_2D(kg, rng_hash, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y);
 	}
 
 	/* subpixel u/v offset */
@@ -320,7 +315,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	                         P, Ng, Ng,
 	                         shader, object, prim,
 	                         u, v, 1.0f, 0.5f,
-	                         !(kernel_tex_fetch(__object_flag, object) & SD_TRANSFORM_APPLIED),
+	                         !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED),
 	                         LAMP_NONE);
 	sd.I = sd.N;
 
@@ -334,18 +329,20 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 
 	/* light passes if we need more than color */
 	if(pass_filter & ~BAKE_FILTER_COLOR)
-		compute_light_pass(kg, &sd, &L, rng, pass_filter, sample);
+		compute_light_pass(kg, &sd, &L, rng_hash, pass_filter, sample);
 
 	switch(type) {
 		/* data passes */
 		case SHADER_EVAL_NORMAL:
 		{
+			float3 N = sd.N;
 			if((sd.flag & SD_HAS_BUMP)) {
-				shader_eval_surface(kg, &sd, &rng, &state, 0.f, 0, SHADER_CONTEXT_MAIN);
+				shader_eval_surface(kg, &sd, &state, 0);
+				N = shader_bsdf_average_normal(kg, &sd);
 			}
 
-			/* compression: normal = (2 * color) - 1 */
-			out = sd.N * 0.5f + make_float3(0.5f, 0.5f, 0.5f);
+			/* encoding: normal = (2 * color) - 1 */
+			out = N * 0.5f + make_float3(0.5f, 0.5f, 0.5f);
 			break;
 		}
 		case SHADER_EVAL_UV:
@@ -355,7 +352,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		}
 		case SHADER_EVAL_EMISSION:
 		{
-			shader_eval_surface(kg, &sd, &rng, &state, 0.f, 0, SHADER_CONTEXT_EMISSION);
+			shader_eval_surface(kg, &sd, &state, 0);
 			out = shader_emissive_eval(kg, &sd);
 			break;
 		}
@@ -370,7 +367,8 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		case SHADER_EVAL_COMBINED:
 		{
 			if((pass_filter & BAKE_FILTER_COMBINED) == BAKE_FILTER_COMBINED) {
-				out = path_radiance_clamp_and_sum(kg, &L);
+				float alpha;
+				out = path_radiance_clamp_and_sum(kg, &L, &alpha);
 				break;
 			}
 
@@ -408,7 +406,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		{
 			out = kernel_bake_evaluate_direct_indirect(kg,
 			                                           &sd,
-			                                           &rng,
 			                                           &state,
 			                                           L.direct_diffuse,
 			                                           L.indirect_diffuse,
@@ -420,7 +417,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		{
 			out = kernel_bake_evaluate_direct_indirect(kg,
 			                                           &sd,
-			                                           &rng,
 			                                           &state,
 			                                           L.direct_glossy,
 			                                           L.indirect_glossy,
@@ -432,7 +428,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		{
 			out = kernel_bake_evaluate_direct_indirect(kg,
 			                                           &sd,
-			                                           &rng,
 			                                           &state,
 			                                           L.direct_transmission,
 			                                           L.indirect_transmission,
@@ -445,7 +440,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 #ifdef __SUBSURFACE__
 			out = kernel_bake_evaluate_direct_indirect(kg,
 			                                           &sd,
-			                                           &rng,
 			                                           &state,
 			                                           L.direct_subsurface,
 			                                           L.indirect_subsurface,
@@ -479,7 +473,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 
 			/* evaluate */
 			int flag = 0; /* we can't know which type of BSDF this is for */
-			out = shader_eval_background(kg, &sd, &state, flag, SHADER_CONTEXT_MAIN);
+			out = shader_eval_background(kg, &sd, &state, flag);
 			break;
 		}
 		default:
@@ -499,78 +493,69 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 
 #endif  /* __BAKING__ */
 
-ccl_device void kernel_shader_evaluate(KernelGlobals *kg,
-                                       ccl_global uint4 *input,
-                                       ccl_global float4 *output,
-                                       ccl_global float *output_luma,
-                                       ShaderEvalType type,
-                                       int i,
-                                       int sample)
+ccl_device void kernel_displace_evaluate(KernelGlobals *kg,
+                                         ccl_global uint4 *input,
+                                         ccl_global float4 *output,
+                                         int i)
 {
 	ShaderData sd;
 	PathState state = {0};
 	uint4 in = input[i];
-	float3 out;
 
-	if(type == SHADER_EVAL_DISPLACE) {
-		/* setup shader data */
-		int object = in.x;
-		int prim = in.y;
-		float u = __uint_as_float(in.z);
-		float v = __uint_as_float(in.w);
+	/* setup shader data */
+	int object = in.x;
+	int prim = in.y;
+	float u = __uint_as_float(in.z);
+	float v = __uint_as_float(in.w);
 
-		shader_setup_from_displace(kg, &sd, object, prim, u, v);
+	shader_setup_from_displace(kg, &sd, object, prim, u, v);
 
-		/* evaluate */
-		float3 P = sd.P;
-		shader_eval_displacement(kg, &sd, &state, SHADER_CONTEXT_MAIN);
-		out = sd.P - P;
+	/* evaluate */
+	float3 P = sd.P;
+	shader_eval_displacement(kg, &sd, &state);
+	float3 D = sd.P - P;
 
-		object_inverse_dir_transform(kg, &sd, &out);
-	}
-	else { // SHADER_EVAL_BACKGROUND
-		/* setup ray */
-		Ray ray;
-		float u = __uint_as_float(in.x);
-		float v = __uint_as_float(in.y);
-
-		ray.P = make_float3(0.0f, 0.0f, 0.0f);
-		ray.D = equirectangular_to_direction(u, v);
-		ray.t = 0.0f;
+	object_inverse_dir_transform(kg, &sd, &D);
+
+	/* write output */
+	output[i] += make_float4(D.x, D.y, D.z, 0.0f);
+}
+
+ccl_device void kernel_background_evaluate(KernelGlobals *kg,
+                                           ccl_global uint4 *input,
+                                           ccl_global float4 *output,
+                                           int i)
+{
+	ShaderData sd;
+	PathState state = {0};
+	uint4 in = input[i];
+
+	/* setup ray */
+	Ray ray;
+	float u = __uint_as_float(in.x);
+	float v = __uint_as_float(in.y);
+
+	ray.P = make_float3(0.0f, 0.0f, 0.0f);
+	ray.D = equirectangular_to_direction(u, v);
+	ray.t = 0.0f;
 #ifdef __CAMERA_MOTION__
-		ray.time = 0.5f;
+	ray.time = 0.5f;
 #endif
 
 #ifdef __RAY_DIFFERENTIALS__
-		ray.dD = differential3_zero();
-		ray.dP = differential3_zero();
+	ray.dD = differential3_zero();
+	ray.dP = differential3_zero();
 #endif
 
-		/* setup shader data */
-		shader_setup_from_background(kg, &sd, &ray);
+	/* setup shader data */
+	shader_setup_from_background(kg, &sd, &ray);
+
+	/* evaluate */
+	int flag = 0; /* we can't know which type of BSDF this is for */
+	float3 color = shader_eval_background(kg, &sd, &state, flag);
 
-		/* evaluate */
-		int flag = 0; /* we can't know which type of BSDF this is for */
-		out = shader_eval_background(kg, &sd, &state, flag, SHADER_CONTEXT_MAIN);
-	}
-	
 	/* write output */
-	if(sample == 0) {
-		if(output != NULL) {
-			output[i] = make_float4(out.x, out.y, out.z, 0.0f);
-		}
-		if(output_luma != NULL) {
-			output_luma[i] = average(out);
-		}
-	}
-	else {
-		if(output != NULL) {
-			output[i] += make_float4(out.x, out.y, out.z, 0.0f);
-		}
-		if(output_luma != NULL) {
-			output_luma[i] += average(out);
-		}
-	}
+	output[i] += make_float4(color.x, color.y, color.z, 0.0f);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index dedac6b1465..0df5217d97a 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -457,7 +457,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 {
 	if(kernel_data.cam.type != CAMERA_PANORAMA) {
 		/* perspective / ortho */
-		if(ccl_fetch(sd, object) == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
+		if(sd->object == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
 			P += camera_position(kg);
 
 		Transform tfm = kernel_data.cam.worldtondc;
@@ -467,7 +467,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 		/* panorama */
 		Transform tfm = kernel_data.cam.worldtocamera;
 
-		if(ccl_fetch(sd, object) != OBJECT_NONE)
+		if(sd->object != OBJECT_NONE)
 			P = normalize(transform_point(&tfm, P));
 		else
 			P = normalize(transform_direction(&tfm, P));
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index dfcfcba2a40..4b43209e4aa 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -35,15 +35,24 @@
 #  define __NODES_FEATURES__ NODE_FEATURE_ALL
 #endif
 
-#include "util_debug.h"
-#include "util_math.h"
-#include "util_simd.h"
-#include "util_half.h"
-#include "util_types.h"
-#include "util_texture.h"
+#include "util/util_debug.h"
+#include "util/util_math.h"
+#include "util/util_simd.h"
+#include "util/util_half.h"
+#include "util/util_types.h"
+#include "util/util_texture.h"
 
 #define ccl_addr_space
 
+#define ccl_local_id(d) 0
+#define ccl_global_id(d) (kg->global_id[d])
+
+#define ccl_local_size(d) 1
+#define ccl_global_size(d) (kg->global_size[d])
+
+#define ccl_group_id(d) ccl_global_id(d)
+#define ccl_num_groups(d) ccl_global_size(d)
+
 /* On x86_64, versions of glibc < 2.16 have an issue where expf is
  * much slower than the double version.  This was fixed in glibc 2.16.
  */
@@ -65,7 +74,7 @@ CCL_NAMESPACE_BEGIN
  * pointer lookup. */
 
 template<typename T> struct texture  {
-	ccl_always_inline T fetch(int index)
+	ccl_always_inline const T& fetch(int index)
 	{
 		kernel_assert(index >= 0 && index < width);
 		return data[index];
@@ -78,9 +87,9 @@ template<typename T> struct texture  {
 	ccl_always_inline avxf fetch_avxf(const int index)
 	{
 		kernel_assert(index >= 0 && (index+1) < width);
-		ssef *ssefData = (ssef*)data;
-		ssef *ssefNodeData = &ssefData[index];
-		return _mm256_loadu_ps((float *)ssefNodeData);
+		ssef *ssef_data = (ssef*)data;
+		ssef *ssef_node_data = &ssef_data[index];
+		return _mm256_loadu_ps((float *)ssef_node_data);
 	}
 
 #endif
@@ -103,420 +112,6 @@ template<typename T> struct texture  {
 	int width;
 };
 
-template<typename T> struct texture_image  {
-#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
-	{ \
-		u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
-		u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
-		u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
-		u[3] = (1.0f / 6.0f) * t * t * t; \
-	} (void)0
-
-	ccl_always_inline float4 read(float4 r)
-	{
-		return r;
-	}
-
-	ccl_always_inline float4 read(uchar4 r)
-	{
-		float f = 1.0f/255.0f;
-		return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
-	}
-
-	ccl_always_inline float4 read(uchar r)
-	{
-		float f = r*(1.0f/255.0f);
-		return make_float4(f, f, f, 1.0f);
-	}
-
-	ccl_always_inline float4 read(float r)
-	{
-		/* TODO(dingto): Optimize this, so interpolation
-		 * happens on float instead of float4 */
-		return make_float4(r, r, r, 1.0f);
-	}
-
-	ccl_always_inline float4 read(half4 r)
-	{
-		return half4_to_float4(r);
-	}
-
-	ccl_always_inline float4 read(half r)
-	{
-		float f = half_to_float(r);
-		return make_float4(f, f, f, 1.0f);
-	}
-
-	ccl_always_inline int wrap_periodic(int x, int width)
-	{
-		x %= width;
-		if(x < 0)
-			x += width;
-		return x;
-	}
-
-	ccl_always_inline int wrap_clamp(int x, int width)
-	{
-		return clamp(x, 0, width-1);
-	}
-
-	ccl_always_inline float frac(float x, int *ix)
-	{
-		int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
-		*ix = i;
-		return x - (float)i;
-	}
-
-	ccl_always_inline float4 interp(float x, float y)
-	{
-		if(UNLIKELY(!data))
-			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
-		int ix, iy, nix, niy;
-
-		if(interpolation == INTERPOLATION_CLOSEST) {
-			frac(x*(float)width, &ix);
-			frac(y*(float)height, &iy);
-			switch(extension) {
-				case EXTENSION_REPEAT:
-					ix = wrap_periodic(ix, width);
-					iy = wrap_periodic(iy, height);
-					break;
-				case EXTENSION_CLIP:
-					if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
-						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-					}
-					/* Fall through. */
-				case EXTENSION_EXTEND:
-					ix = wrap_clamp(ix, width);
-					iy = wrap_clamp(iy, height);
-					break;
-				default:
-					kernel_assert(0);
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			}
-			return read(data[ix + iy*width]);
-		}
-		else if(interpolation == INTERPOLATION_LINEAR) {
-			float tx = frac(x*(float)width - 0.5f, &ix);
-			float ty = frac(y*(float)height - 0.5f, &iy);
-
-			switch(extension) {
-				case EXTENSION_REPEAT:
-					ix = wrap_periodic(ix, width);
-					iy = wrap_periodic(iy, height);
-
-					nix = wrap_periodic(ix+1, width);
-					niy = wrap_periodic(iy+1, height);
-					break;
-				case EXTENSION_CLIP:
-					if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
-						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-					}
-					/* Fall through. */
-				case EXTENSION_EXTEND:
-					nix = wrap_clamp(ix+1, width);
-					niy = wrap_clamp(iy+1, height);
-
-					ix = wrap_clamp(ix, width);
-					iy = wrap_clamp(iy, height);
-					break;
-				default:
-					kernel_assert(0);
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			}
-
-			float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]);
-			r += (1.0f - ty)*tx*read(data[nix + iy*width]);
-			r += ty*(1.0f - tx)*read(data[ix + niy*width]);
-			r += ty*tx*read(data[nix + niy*width]);
-
-			return r;
-		}
-		else {
-			/* Bicubic b-spline interpolation. */
-			float tx = frac(x*(float)width - 0.5f, &ix);
-			float ty = frac(y*(float)height - 0.5f, &iy);
-			int pix, piy, nnix, nniy;
-			switch(extension) {
-				case EXTENSION_REPEAT:
-					ix = wrap_periodic(ix, width);
-					iy = wrap_periodic(iy, height);
-
-					pix = wrap_periodic(ix-1, width);
-					piy = wrap_periodic(iy-1, height);
-
-					nix = wrap_periodic(ix+1, width);
-					niy = wrap_periodic(iy+1, height);
-
-					nnix = wrap_periodic(ix+2, width);
-					nniy = wrap_periodic(iy+2, height);
-					break;
-				case EXTENSION_CLIP:
-					if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
-						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-					}
-					/* Fall through. */
-				case EXTENSION_EXTEND:
-					pix = wrap_clamp(ix-1, width);
-					piy = wrap_clamp(iy-1, height);
-
-					nix = wrap_clamp(ix+1, width);
-					niy = wrap_clamp(iy+1, height);
-
-					nnix = wrap_clamp(ix+2, width);
-					nniy = wrap_clamp(iy+2, height);
-
-					ix = wrap_clamp(ix, width);
-					iy = wrap_clamp(iy, height);
-					break;
-				default:
-					kernel_assert(0);
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			}
-
-			const int xc[4] = {pix, ix, nix, nnix};
-			const int yc[4] = {width * piy,
-			                   width * iy,
-			                   width * niy,
-			                   width * nniy};
-			float u[4], v[4];
-			/* Some helper macro to keep code reasonable size,
-			 * let compiler to inline all the matrix multiplications.
-			 */
-#define DATA(x, y) (read(data[xc[x] + yc[y]]))
-#define TERM(col) \
-			(v[col] * (u[0] * DATA(0, col) + \
-			           u[1] * DATA(1, col) + \
-			           u[2] * DATA(2, col) + \
-			           u[3] * DATA(3, col)))
-
-			SET_CUBIC_SPLINE_WEIGHTS(u, tx);
-			SET_CUBIC_SPLINE_WEIGHTS(v, ty);
-
-			/* Actual interpolation. */
-			return TERM(0) + TERM(1) + TERM(2) + TERM(3);
-
-#undef TERM
-#undef DATA
-		}
-	}
-
-	ccl_always_inline float4 interp_3d(float x, float y, float z)
-	{
-		return interp_3d_ex(x, y, z, interpolation);
-	}
-
-	ccl_always_inline float4 interp_3d_ex(float x, float y, float z,
-	                                      int interpolation = INTERPOLATION_LINEAR)
-	{
-		if(UNLIKELY(!data))
-			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
-		int ix, iy, iz, nix, niy, niz;
-
-		if(interpolation == INTERPOLATION_CLOSEST) {
-			frac(x*(float)width, &ix);
-			frac(y*(float)height, &iy);
-			frac(z*(float)depth, &iz);
-
-			switch(extension) {
-				case EXTENSION_REPEAT:
-					ix = wrap_periodic(ix, width);
-					iy = wrap_periodic(iy, height);
-					iz = wrap_periodic(iz, depth);
-					break;
-				case EXTENSION_CLIP:
-					if(x < 0.0f || y < 0.0f || z < 0.0f ||
-					   x > 1.0f || y > 1.0f || z > 1.0f)
-					{
-						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-					}
-					/* Fall through. */
-				case EXTENSION_EXTEND:
-					ix = wrap_clamp(ix, width);
-					iy = wrap_clamp(iy, height);
-					iz = wrap_clamp(iz, depth);
-					break;
-				default:
-					kernel_assert(0);
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			}
-
-			return read(data[ix + iy*width + iz*width*height]);
-		}
-		else if(interpolation == INTERPOLATION_LINEAR) {
-			float tx = frac(x*(float)width - 0.5f, &ix);
-			float ty = frac(y*(float)height - 0.5f, &iy);
-			float tz = frac(z*(float)depth - 0.5f, &iz);
-
-			switch(extension) {
-				case EXTENSION_REPEAT:
-					ix = wrap_periodic(ix, width);
-					iy = wrap_periodic(iy, height);
-					iz = wrap_periodic(iz, depth);
-
-					nix = wrap_periodic(ix+1, width);
-					niy = wrap_periodic(iy+1, height);
-					niz = wrap_periodic(iz+1, depth);
-					break;
-				case EXTENSION_CLIP:
-					if(x < 0.0f || y < 0.0f || z < 0.0f ||
-					   x > 1.0f || y > 1.0f || z > 1.0f)
-					{
-						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-					}
-					/* Fall through. */
-				case EXTENSION_EXTEND:
-					nix = wrap_clamp(ix+1, width);
-					niy = wrap_clamp(iy+1, height);
-					niz = wrap_clamp(iz+1, depth);
-
-					ix = wrap_clamp(ix, width);
-					iy = wrap_clamp(iy, height);
-					iz = wrap_clamp(iz, depth);
-					break;
-				default:
-					kernel_assert(0);
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			}
-
-			float4 r;
-
-			r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]);
-			r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]);
-			r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]);
-			r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]);
-
-			r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]);
-			r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]);
-			r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]);
-			r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]);
-
-			return r;
-		}
-		else {
-			/* Tricubic b-spline interpolation. */
-			const float tx = frac(x*(float)width - 0.5f, &ix);
-			const float ty = frac(y*(float)height - 0.5f, &iy);
-			const float tz = frac(z*(float)depth - 0.5f, &iz);
-			int pix, piy, piz, nnix, nniy, nniz;
-
-			switch(extension) {
-				case EXTENSION_REPEAT:
-					ix = wrap_periodic(ix, width);
-					iy = wrap_periodic(iy, height);
-					iz = wrap_periodic(iz, depth);
-
-					pix = wrap_periodic(ix-1, width);
-					piy = wrap_periodic(iy-1, height);
-					piz = wrap_periodic(iz-1, depth);
-
-					nix = wrap_periodic(ix+1, width);
-					niy = wrap_periodic(iy+1, height);
-					niz = wrap_periodic(iz+1, depth);
-
-					nnix = wrap_periodic(ix+2, width);
-					nniy = wrap_periodic(iy+2, height);
-					nniz = wrap_periodic(iz+2, depth);
-					break;
-				case EXTENSION_CLIP:
-					if(x < 0.0f || y < 0.0f || z < 0.0f ||
-					   x > 1.0f || y > 1.0f || z > 1.0f)
-					{
-						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-					}
-					/* Fall through. */
-				case EXTENSION_EXTEND:
-					pix = wrap_clamp(ix-1, width);
-					piy = wrap_clamp(iy-1, height);
-					piz = wrap_clamp(iz-1, depth);
-
-					nix = wrap_clamp(ix+1, width);
-					niy = wrap_clamp(iy+1, height);
-					niz = wrap_clamp(iz+1, depth);
-
-					nnix = wrap_clamp(ix+2, width);
-					nniy = wrap_clamp(iy+2, height);
-					nniz = wrap_clamp(iz+2, depth);
-
-					ix = wrap_clamp(ix, width);
-					iy = wrap_clamp(iy, height);
-					iz = wrap_clamp(iz, depth);
-					break;
-				default:
-					kernel_assert(0);
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			}
-
-			const int xc[4] = {pix, ix, nix, nnix};
-			const int yc[4] = {width * piy,
-			                   width * iy,
-			                   width * niy,
-			                   width * nniy};
-			const int zc[4] = {width * height * piz,
-			                   width * height * iz,
-			                   width * height * niz,
-			                   width * height * nniz};
-			float u[4], v[4], w[4];
-
-			/* Some helper macro to keep code reasonable size,
-			 * let compiler to inline all the matrix multiplications.
-			 */
-#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
-#define COL_TERM(col, row) \
-			(v[col] * (u[0] * DATA(0, col, row) + \
-			           u[1] * DATA(1, col, row) + \
-			           u[2] * DATA(2, col, row) + \
-			           u[3] * DATA(3, col, row)))
-#define ROW_TERM(row) \
-			(w[row] * (COL_TERM(0, row) + \
-			           COL_TERM(1, row) + \
-			           COL_TERM(2, row) + \
-			           COL_TERM(3, row)))
-
-			SET_CUBIC_SPLINE_WEIGHTS(u, tx);
-			SET_CUBIC_SPLINE_WEIGHTS(v, ty);
-			SET_CUBIC_SPLINE_WEIGHTS(w, tz);
-
-			/* Actual interpolation. */
-			return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
-
-#undef COL_TERM
-#undef ROW_TERM
-#undef DATA
-		}
-	}
-
-	ccl_always_inline void dimensions_set(int width_, int height_, int depth_)
-	{
-		width = width_;
-		height = height_;
-		depth = depth_;
-	}
-
-	T *data;
-	int interpolation;
-	ExtensionType extension;
-	int width, height, depth;
-#undef SET_CUBIC_SPLINE_WEIGHTS
-};
-
-typedef texture<float4> texture_float4;
-typedef texture<float2> texture_float2;
-typedef texture<float> texture_float;
-typedef texture<uint> texture_uint;
-typedef texture<int> texture_int;
-typedef texture<uint4> texture_uint4;
-typedef texture<uchar4> texture_uchar4;
-typedef texture<uchar> texture_uchar;
-typedef texture_image<float> texture_image_float;
-typedef texture_image<uchar> texture_image_uchar;
-typedef texture_image<half> texture_image_half;
-typedef texture_image<float4> texture_image_float4;
-typedef texture_image<uchar4> texture_image_uchar4;
-typedef texture_image<half4> texture_image_half4;
-
 /* Macros to handle different memory storage on different devices */
 
 #define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
@@ -524,9 +119,6 @@ typedef texture_image<half4> texture_image_half4;
 #define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
 #define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
 #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
-#define kernel_tex_image_interp(tex,x,y) kernel_tex_image_interp_impl(kg,tex,x,y)
-#define kernel_tex_image_interp_3d(tex, x, y, z) kernel_tex_image_interp_3d_impl(kg,tex,x,y,z)
-#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) kernel_tex_image_interp_3d_ex_impl(kg,tex, x, y, z, interpolation)
 #define kernel_tex_voxel_float(tex, x, y, z, sampling) (vdb_volume_sample_scalar(kg->vdb, kg->vdb_tdata, tex, x, y, z, sampling))
 #define kernel_tex_voxel_float3(tex, x, y, z, sampling) (vdb_volume_sample_vector(kg->vdb, kg->vdb_tdata, tex, x, y, z, sampling))
 
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index e0c7b17c6a0..fa512f80e41 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -33,71 +33,109 @@
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <float.h>
+#include <stdint.h>
 
 /* Qualifier wrappers for different names on different devices */
 
 #define ccl_device  __device__ __inline__
+#if __CUDA_ARCH__ < 300
+#  define ccl_device_inline  __device__ __inline__
 #  define ccl_device_forceinline  __device__ __forceinline__
-#if (__KERNEL_CUDA_VERSION__ == 80) && (__CUDA_ARCH__ < 500)
+#elif __CUDA_ARCH__ < 500
 #  define ccl_device_inline  __device__ __forceinline__
+#  define ccl_device_forceinline  __device__ __forceinline__
 #else
 #  define ccl_device_inline  __device__ __inline__
+#  define ccl_device_forceinline  __device__ __forceinline__
 #endif
 #define ccl_device_noinline  __device__ __noinline__
 #define ccl_global
-#define ccl_constant
+#define ccl_static_constant __constant__
+#define ccl_constant const
+#define ccl_local __shared__
+#define ccl_local_param
+#define ccl_private
 #define ccl_may_alias
 #define ccl_addr_space
 #define ccl_restrict __restrict__
+/* TODO(sergey): In theory we could use references with CUDA; however, the
+ * performance impact is yet to be investigated.
+ */
+#define ccl_ref
 #define ccl_align(n) __align__(n)
 
+#define ATTR_FALLTHROUGH
+
+#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH)
+
+
 /* No assert supported for CUDA */
 
 #define kernel_assert(cond)
 
 /* Types */
 
-#include "util_half.h"
-#include "util_types.h"
+#include "util/util_half.h"
+#include "util/util_types.h"
+
+/* Work item functions */
+
+ccl_device_inline uint ccl_local_id(uint d)
+{
+	switch(d) {
+		case 0: return threadIdx.x;
+		case 1: return threadIdx.y;
+		case 2: return threadIdx.z;
+		default: return 0;
+	}
+}
+
+#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d))
+
+ccl_device_inline uint ccl_local_size(uint d)
+{
+	switch(d) {
+		case 0: return blockDim.x;
+		case 1: return blockDim.y;
+		case 2: return blockDim.z;
+		default: return 0;
+	}
+}
+
+#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d))
+
+ccl_device_inline uint ccl_group_id(uint d)
+{
+	switch(d) {
+		case 0: return blockIdx.x;
+		case 1: return blockIdx.y;
+		case 2: return blockIdx.z;
+		default: return 0;
+	}
+}
+
+ccl_device_inline uint ccl_num_groups(uint d)
+{
+	switch(d) {
+		case 0: return gridDim.x;
+		case 1: return gridDim.y;
+		case 2: return gridDim.z;
+		default: return 0;
+	}
+}
 
 /* Textures */
 
-typedef texture<float4, 1> texture_float4;
-typedef texture<float2, 1> texture_float2;
-typedef texture<float, 1> texture_float;
-typedef texture<uint, 1> texture_uint;
-typedef texture<int, 1> texture_int;
-typedef texture<uint4, 1> texture_uint4;
-typedef texture<uchar, 1> texture_uchar;
-typedef texture<uchar4, 1> texture_uchar4;
+/* Use arrays for regular data. This is a little slower than textures on Fermi,
+ * but allows for cleaner code and we will stop supporting Fermi soon. */
+#define kernel_tex_fetch(t, index) t[(index)]
+
+/* On Kepler (6xx) and above, we use Bindless Textures for images.
+ * On Fermi cards (4xx and 5xx), we have to use regular textures. */
+#if __CUDA_ARCH__ < 300
 typedef texture<float4, 2> texture_image_float4;
 typedef texture<float4, 3> texture_image3d_float4;
 typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
-
-/* Macros to handle different memory storage on different devices */
-
-/* On Fermi cards (4xx and 5xx), we use regular textures for both data and images.
- * On Kepler (6xx) and above, we use Bindless Textures for images and arrays for data.
- *
- * Arrays are necessary in order to use the full VRAM on newer cards, and it's slightly faster.
- * Using Arrays on Fermi turned out to be slower.*/
-
-/* Fermi */
-#if __CUDA_ARCH__ < 300
-#  define __KERNEL_CUDA_TEX_STORAGE__
-#  define kernel_tex_fetch(t, index) tex1Dfetch(t, index)
-
-#  define kernel_tex_image_interp(t, x, y) tex2D(t, x, y)
-#  define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z)
-
-/* Kepler */
-#else
-#  define kernel_tex_fetch(t, index) t[(index)]
-
-#  define kernel_tex_image_interp_float4(t, x, y) tex2D<float4>(t, x, y)
-#  define kernel_tex_image_interp_float(t, x, y) tex2D<float>(t, x, y)
-#  define kernel_tex_image_interp_3d_float4(t, x, y, z) tex3D<float4>(t, x, y, z)
-#  define kernel_tex_image_interp_3d_float(t, x, y, z) tex3D<float>(t, x, y, z)
 #endif
 
 #define kernel_data __data
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index f076e3a7d37..b02e3bc576d 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -36,11 +36,14 @@
 #define ccl_device_forceinline ccl_device
 #define ccl_device_noinline ccl_device ccl_noinline
 #define ccl_may_alias
+#define ccl_static_constant static __constant
 #define ccl_constant __constant
 #define ccl_global __global
 #define ccl_local __local
+#define ccl_local_param __local
 #define ccl_private __private
 #define ccl_restrict restrict
+#define ccl_ref
 #define ccl_align(n) __attribute__((aligned(n)))
 
 #ifdef __SPLIT_KERNEL__
@@ -49,6 +52,17 @@
 #  define ccl_addr_space
 #endif
 
+#define ATTR_FALLTHROUGH
+
+#define ccl_local_id(d) get_local_id(d)
+#define ccl_global_id(d) get_global_id(d)
+
+#define ccl_local_size(d) get_local_size(d)
+#define ccl_global_size(d) get_global_size(d)
+
+#define ccl_group_id(d) get_group_id(d)
+#define ccl_num_groups(d) get_num_groups(d)
+
 /* Selective nodes compilation. */
 #ifndef __NODES_MAX_GROUP__
 #  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
@@ -117,6 +131,7 @@
 #  define expf(x) native_exp(((float)(x)))
 #  define sqrtf(x) native_sqrt(((float)(x)))
 #  define logf(x) native_log(((float)(x)))
+#  define rcp(x)  native_recip(x)
 #else
 #  define sinf(x) sin(((float)(x)))
 #  define cosf(x) cos(((float)(x)))
@@ -124,17 +139,18 @@
 #  define expf(x) exp(((float)(x)))
 #  define sqrtf(x) sqrt(((float)(x)))
 #  define logf(x) log(((float)(x)))
+#  define rcp(x)  (1.0f/(x))  /* OpenCL C has no recip() built-in; plain division is portable. */
 #endif
 
 /* data lookup defines */
 #define kernel_data (*kg->data)
-#define kernel_tex_fetch(t, index) kg->t[index]
+#define kernel_tex_fetch(tex, index) ((const ccl_global tex##_t*)(kg->buffers[kg->tex.cl_buffer] + kg->tex.data))[(index)]
 
 /* define NULL */
 #define NULL 0
 
-#include "util_half.h"
-#include "util_types.h"
+#include "util/util_half.h"
+#include "util/util_types.h"
 
 #endif /* __KERNEL_COMPAT_OPENCL_H__ */
 
diff --git a/intern/cycles/kernel/kernel_debug.h b/intern/cycles/kernel/kernel_debug.h
deleted file mode 100644
index 24d6458567e..00000000000
--- a/intern/cycles/kernel/kernel_debug.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright 2011-2014 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline void debug_data_init(DebugData *debug_data)
-{
-	debug_data->num_bvh_traversal_steps = 0;
-	debug_data->num_bvh_traversed_instances = 0;
-	debug_data->num_ray_bounces = 0;
-}
-
-ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
-                                                 ccl_global float *buffer,
-                                                 ccl_addr_space PathState *state,
-                                                 DebugData *debug_data,
-                                                 int sample)
-{
-	int flag = kernel_data.film.pass_flag;
-	if(flag & PASS_BVH_TRAVERSAL_STEPS) {
-		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversal_steps,
-		                        sample,
-		                        debug_data->num_bvh_traversal_steps);
-	}
-	if(flag & PASS_BVH_TRAVERSED_INSTANCES) {
-		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances,
-		                        sample,
-		                        debug_data->num_bvh_traversed_instances);
-	}
-	if(flag & PASS_RAY_BOUNCES) {
-		kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces,
-		                        sample,
-		                        debug_data->num_ray_bounces);
-	}
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index 8c7c651a053..45b8c6311e1 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -37,16 +37,14 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 		ray.D = ls->D;
 		ray.P = ls->P;
 		ray.t = 1.0f;
-#  ifdef __OBJECT_MOTION__
 		ray.time = time;
-#  endif
 		ray.dP = differential3_zero();
 		ray.dD = dI;
 
 		shader_setup_from_background(kg, emission_sd, &ray);
 
 		path_state_modify_bounce(state, true);
-		eval = shader_eval_background(kg, emission_sd, state, 0, SHADER_CONTEXT_EMISSION);
+		eval = shader_eval_background(kg, emission_sd, state, 0);
 		path_state_modify_bounce(state, false);
 	}
 	else
@@ -67,16 +65,16 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 		                         ls->shader, ls->object, ls->prim,
 		                         ls->u, ls->v, t, time, false, ls->lamp);
 
-		ls->Ng = ccl_fetch(emission_sd, Ng);
+		ls->Ng = emission_sd->Ng;
 
 		/* no path flag, we're evaluating this for all closures. that's weak but
 		 * we'd have to do multiple evaluations otherwise */
 		path_state_modify_bounce(state, true);
-		shader_eval_surface(kg, emission_sd, NULL, state, 0.0f, 0, SHADER_CONTEXT_EMISSION);
+		shader_eval_surface(kg, emission_sd, state, 0);
 		path_state_modify_bounce(state, false);
 
 		/* evaluate emissive closure */
-		if(ccl_fetch(emission_sd, flag) & SD_EMISSION)
+		if(emission_sd->flag & SD_EMISSION)
 			eval = shader_emissive_eval(kg, emission_sd);
 		else
 			eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -112,7 +110,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 	                                         -ls->D,
 	                                         dD,
 	                                         ls->t,
-	                                         ccl_fetch(sd, time));
+	                                         sd->time);
 
 	if(is_zero(light_eval))
 		return false;
@@ -120,7 +118,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 	/* evaluate BSDF at shading point */
 
 #ifdef __VOLUME__
-	if(ccl_fetch(sd, prim) != PRIM_NONE)
+	if(sd->prim != PRIM_NONE)
 		shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS);
 	else {
 		float bsdf_pdf;
@@ -156,8 +154,13 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 	if(bsdf_eval_is_zero(eval))
 		return false;
 
-	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
-		float probability = max3(bsdf_eval_sum(eval)) * kernel_data.integrator.light_inv_rr_threshold;
+	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f
+#ifdef __SHADOW_TRICKS__
+	   && (state->flag & PATH_RAY_SHADOW_CATCHER) == 0
+#endif
+	  )
+	{
+		float probability = max3(fabs(bsdf_eval_sum(eval))) * kernel_data.integrator.light_inv_rr_threshold;
 		if(probability < 1.0f) {
 			if(rand_terminate >= probability) {
 				return false;
@@ -168,8 +171,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 
 	if(ls->shader & SHADER_CAST_SHADOW) {
 		/* setup ray */
-		bool transmit = (dot(ccl_fetch(sd, Ng), ls->D) < 0.0f);
-		ray->P = ray_offset(ccl_fetch(sd, P), (transmit)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
+		bool transmit = (dot(sd->Ng, ls->D) < 0.0f);
+		ray->P = ray_offset(sd->P, (transmit)? -sd->Ng: sd->Ng);
 
 		if(ls->t == FLT_MAX) {
 			/* distant light */
@@ -182,7 +185,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 			ray->D = normalize_len(ray->D, &ray->t);
 		}
 
-		ray->dP = ccl_fetch(sd, dP);
+		ray->dP = sd->dP;
 		ray->dD = differential3_zero();
 	}
 	else {
@@ -204,14 +207,14 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
 	float3 L = shader_emissive_eval(kg, sd);
 
 #ifdef __HAIR__
-	if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS) && (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE))
+	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE))
 #else
-	if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS))
+	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
 #endif
 	{
 		/* multiple importance sampling, get triangle light pdf,
 		 * and compute weight with respect to BSDF pdf */
-		float pdf = triangle_light_pdf(kg, ccl_fetch(sd, Ng), ccl_fetch(sd, I), t);
+		float pdf = triangle_light_pdf(kg, sd, t);
 		float mis_weight = power_heuristic(bsdf_pdf, pdf);
 
 		return L*mis_weight;
@@ -314,7 +317,7 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg,
 #  endif
 
 	path_state_modify_bounce(state, true);
-	float3 L = shader_eval_background(kg, emission_sd, state, state->flag, SHADER_CONTEXT_EMISSION);
+	float3 L = shader_eval_background(kg, emission_sd, state, state->flag);
 	path_state_modify_bounce(state, false);
 
 #ifdef __BACKGROUND_MIS__
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index 74357bd96fc..7e2f67bbd63 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -16,6 +16,17 @@
 
 /* Constant Globals */
 
+#ifndef __KERNEL_GLOBALS_H__
+#define __KERNEL_GLOBALS_H__
+
+#ifdef __KERNEL_CPU__
+#  include "util/util_vector.h"
+#endif
+
+#ifdef __KERNEL_OPENCL__
+#  include "util/util_atomic.h"
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
@@ -42,16 +53,9 @@ struct VolumeStep;
 #  define MAX_VOLUME        1024
 
 typedef struct KernelGlobals {
-	texture_image_uchar4 texture_byte4_images[TEX_NUM_BYTE4_CPU];
-	texture_image_float4 texture_float4_images[TEX_NUM_FLOAT4_CPU];
-	texture_image_half4 texture_half4_images[TEX_NUM_HALF4_CPU];
-	texture_image_float texture_float_images[TEX_NUM_FLOAT_CPU];
-	texture_image_uchar texture_byte_images[TEX_NUM_BYTE_CPU];
-	texture_image_half texture_half_images[TEX_NUM_HALF_CPU];
-
-#  define KERNEL_TEX(type, ttype, name) ttype name;
+#  define KERNEL_TEX(type, name) texture<type> name;
 #  define KERNEL_IMAGE_TEX(type, ttype, name)
-#  include "kernel_textures.h"
+#  include "kernel/kernel_textures.h"
 
 	KernelData __data;
 
@@ -72,7 +76,15 @@ typedef struct KernelGlobals {
 	VolumeStep *decoupled_volume_steps[2];
 	int decoupled_volume_steps_index;
 
+	/* split kernel */
+	SplitData split_data;
+	SplitParams split_param_data;
+
+	int2 global_size;
+	int2 global_id;
+
 #  ifdef WITH_OPENVDB
+	/* OpenVDB */
 	OpenVDBGlobals *vdb;
 	OpenVDBThreadData *vdb_tdata;
 #  endif
@@ -88,15 +100,14 @@ typedef struct KernelGlobals {
 #ifdef __KERNEL_CUDA__
 
 __constant__ KernelData __data;
-typedef struct KernelGlobals {} KernelGlobals;
+typedef struct KernelGlobals {
+	/* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */
+	Intersection hits_stack[64];
+} KernelGlobals;
 
-#  ifdef __KERNEL_CUDA_TEX_STORAGE__
-#    define KERNEL_TEX(type, ttype, name) ttype name;
-#  else
-#    define KERNEL_TEX(type, ttype, name) const __constant__ __device__ type *name;
-#  endif
+#  define KERNEL_TEX(type, name) const __constant__ __device__ type *name;
 #  define KERNEL_IMAGE_TEX(type, ttype, name) ttype name;
-#  include "kernel_textures.h"
+#  include "kernel/kernel_textures.h"
 
 #endif  /* __KERNEL_CUDA__ */
 
@@ -104,19 +115,75 @@ typedef struct KernelGlobals {} KernelGlobals;
 
 #ifdef __KERNEL_OPENCL__
 
+#  define KERNEL_TEX(type, name) \
+typedef type name##_t;
+#  include "kernel/kernel_textures.h"
+
 typedef ccl_addr_space struct KernelGlobals {
 	ccl_constant KernelData *data;
+	ccl_global char *buffers[8];
 
-#  define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name;
-#  include "kernel_textures.h"
+#  define KERNEL_TEX(type, name) \
+	TextureInfo name;
+#  include "kernel/kernel_textures.h"
 
 #  ifdef __SPLIT_KERNEL__
-	ShaderData *sd_input;
-	Intersection *isect_shadow;
+	SplitData split_data;
+	SplitParams split_param_data;
 #  endif
 } KernelGlobals;
 
+#define KERNEL_BUFFER_PARAMS \
+	ccl_global char *buffer0, \
+	ccl_global char *buffer1, \
+	ccl_global char *buffer2, \
+	ccl_global char *buffer3, \
+	ccl_global char *buffer4, \
+	ccl_global char *buffer5, \
+	ccl_global char *buffer6, \
+	ccl_global char *buffer7
+
+#define KERNEL_BUFFER_ARGS buffer0, buffer1, buffer2, buffer3, buffer4, buffer5, buffer6, buffer7
+
+ccl_device_inline void kernel_set_buffer_pointers(KernelGlobals *kg, KERNEL_BUFFER_PARAMS)
+{
+#ifdef __SPLIT_KERNEL__
+	if(ccl_local_id(0) + ccl_local_id(1) == 0)
+#endif
+	{
+		kg->buffers[0] = buffer0;
+		kg->buffers[1] = buffer1;
+		kg->buffers[2] = buffer2;
+		kg->buffers[3] = buffer3;
+		kg->buffers[4] = buffer4;
+		kg->buffers[5] = buffer5;
+		kg->buffers[6] = buffer6;
+		kg->buffers[7] = buffer7;
+	}
+
+#  ifdef __SPLIT_KERNEL__
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+#  endif
+}
+
+ccl_device_inline void kernel_set_buffer_info(KernelGlobals *kg)
+{
+#  ifdef __SPLIT_KERNEL__
+	if(ccl_local_id(0) + ccl_local_id(1) == 0)
+#  endif
+	{
+		ccl_global TextureInfo *info = (ccl_global TextureInfo*)kg->buffers[0];
+
+#  define KERNEL_TEX(type, name) \
+		kg->name = *(info++);
+#  include "kernel/kernel_textures.h"
+	}
+
+#  ifdef __SPLIT_KERNEL__
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+#  endif
+}
+
 #endif  /* __KERNEL_OPENCL__ */
 
 /* Interpolated lookup table access */
@@ -155,3 +222,4 @@ ccl_device float lookup_table_read_2D(KernelGlobals *kg, float x, float y, int o
 
 CCL_NAMESPACE_END
 
+#endif  /* __KERNEL_GLOBALS_H__ */
diff --git a/intern/cycles/kernel/kernel_image_opencl.h b/intern/cycles/kernel/kernel_image_opencl.h
deleted file mode 100644
index 0352c58037d..00000000000
--- a/intern/cycles/kernel/kernel_image_opencl.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * Copyright 2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-/* For OpenCL all images are packed in a single array, and we do manual lookup
- * and interpolation. */
-
-ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset)
-{
-	/* Float4 */
-	if(id < TEX_START_BYTE4_OPENCL) {
-		return kernel_tex_fetch(__tex_image_float4_packed, offset);
-	}
-	/* Byte4 */
-	else if(id < TEX_START_FLOAT_OPENCL) {
-		uchar4 r = kernel_tex_fetch(__tex_image_byte4_packed, offset);
-		float f = 1.0f/255.0f;
-		return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
-	}
-	/* Float */
-	else if(id < TEX_START_BYTE_OPENCL) {
-		float f = kernel_tex_fetch(__tex_image_float_packed, offset);
-		return make_float4(f, f, f, 1.0f);
-	}
-	/* Byte */
-	else {
-		uchar r = kernel_tex_fetch(__tex_image_byte_packed, offset);
-		float f = r * (1.0f/255.0f);
-		return make_float4(f, f, f, 1.0f);
-	}
-}
-
-ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
-{
-	x %= width;
-	if(x < 0)
-		x += width;
-	return x;
-}
-
-ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
-{
-	return clamp(x, 0, width-1);
-}
-
-ccl_device_inline float svm_image_texture_frac(float x, int *ix)
-{
-	int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
-	*ix = i;
-	return x - (float)i;
-}
-
-ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
-{
-	uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2);
-	uint width = info.x;
-	uint height = info.y;
-	uint offset = info.z;
-
-	/* Image Options */
-	uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
-	uint extension;
-	if(info.w & (1 << 1))
-		extension = EXTENSION_REPEAT;
-	else if(info.w & (1 << 2))
-		extension = EXTENSION_EXTEND;
-	else
-		extension = EXTENSION_CLIP;
-
-	float4 r;
-	int ix, iy, nix, niy;
-	if(interpolation == INTERPOLATION_CLOSEST) {
-		svm_image_texture_frac(x*width, &ix);
-		svm_image_texture_frac(y*height, &iy);
-
-		if(extension == EXTENSION_REPEAT) {
-			ix = svm_image_texture_wrap_periodic(ix, width);
-			iy = svm_image_texture_wrap_periodic(iy, height);
-		}
-		else {
-			if(extension == EXTENSION_CLIP) {
-				if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-				}
-			}
-			/* Fall through. */
-			/* EXTENSION_EXTEND */
-			ix = svm_image_texture_wrap_clamp(ix, width);
-			iy = svm_image_texture_wrap_clamp(iy, height);
-		}
-
-		r = svm_image_texture_read(kg, id, offset + ix + iy*width);
-	}
-	else { /* INTERPOLATION_LINEAR */
-		float tx = svm_image_texture_frac(x*width - 0.5f, &ix);
-		float ty = svm_image_texture_frac(y*height - 0.5f, &iy);
-
-		if(extension == EXTENSION_REPEAT) {
-			ix = svm_image_texture_wrap_periodic(ix, width);
-			iy = svm_image_texture_wrap_periodic(iy, height);
-
-			nix = svm_image_texture_wrap_periodic(ix+1, width);
-			niy = svm_image_texture_wrap_periodic(iy+1, height);
-		}
-		else {
-			if(extension == EXTENSION_CLIP) {
-				if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-				}
-			}
-			nix = svm_image_texture_wrap_clamp(ix+1, width);
-			niy = svm_image_texture_wrap_clamp(iy+1, height);
-			ix = svm_image_texture_wrap_clamp(ix, width);
-			iy = svm_image_texture_wrap_clamp(iy, height);
-		}
-
-		r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width);
-		r += (1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width);
-		r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width);
-		r += ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width);
-	}
-
-	return r;
-}
-
-
-ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z)
-{
-	uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2);
-	uint width = info.x;
-	uint height = info.y;
-	uint offset = info.z;
-	uint depth = kernel_tex_fetch(__tex_image_packed_info, id*2+1).x;
-
-	/* Image Options */
-	uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
-	uint extension;
-	if(info.w & (1 << 1))
-		extension = EXTENSION_REPEAT;
-	else if(info.w & (1 << 2))
-		extension = EXTENSION_EXTEND;
-	else
-		extension = EXTENSION_CLIP;
-
-	float4 r;
-	int ix, iy, iz, nix, niy, niz;
-	if(interpolation == INTERPOLATION_CLOSEST) {
-		svm_image_texture_frac(x*width, &ix);
-		svm_image_texture_frac(y*height, &iy);
-		svm_image_texture_frac(z*depth, &iz);
-
-		if(extension == EXTENSION_REPEAT) {
-			ix = svm_image_texture_wrap_periodic(ix, width);
-			iy = svm_image_texture_wrap_periodic(iy, height);
-			iz = svm_image_texture_wrap_periodic(iz, depth);
-		}
-		else {
-			if(extension == EXTENSION_CLIP) {
-				if(x < 0.0f || y < 0.0f || z < 0.0f ||
-				   x > 1.0f || y > 1.0f || z > 1.0f)
-				 {
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-				}
-			}
-			/* Fall through. */
-			/* EXTENSION_EXTEND */
-			ix = svm_image_texture_wrap_clamp(ix, width);
-			iy = svm_image_texture_wrap_clamp(iy, height);
-			iz = svm_image_texture_wrap_clamp(iz, depth);
-		}
-		r = svm_image_texture_read(kg, id, offset + ix + iy*width + iz*width*height);
-	}
-	else { /* INTERPOLATION_LINEAR */
-		float tx = svm_image_texture_frac(x*(float)width - 0.5f, &ix);
-		float ty = svm_image_texture_frac(y*(float)height - 0.5f, &iy);
-		float tz = svm_image_texture_frac(z*(float)depth - 0.5f, &iz);
-
-		if(extension == EXTENSION_REPEAT) {
-			ix = svm_image_texture_wrap_periodic(ix, width);
-			iy = svm_image_texture_wrap_periodic(iy, height);
-			iz = svm_image_texture_wrap_periodic(iz, depth);
-
-			nix = svm_image_texture_wrap_periodic(ix+1, width);
-			niy = svm_image_texture_wrap_periodic(iy+1, height);
-			niz = svm_image_texture_wrap_periodic(iz+1, depth);
-		}
-		else {
-			if(extension == EXTENSION_CLIP)
-				if(x < 0.0f || y < 0.0f || z < 0.0f ||
-				   x > 1.0f || y > 1.0f || z > 1.0f)
-				{
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-				}
-			/* Fall through. */
-			/*  EXTENSION_EXTEND */
-			nix = svm_image_texture_wrap_clamp(ix+1, width);
-			niy = svm_image_texture_wrap_clamp(iy+1, height);
-			niz = svm_image_texture_wrap_clamp(iz+1, depth);
-
-			ix = svm_image_texture_wrap_clamp(ix, width);
-			iy = svm_image_texture_wrap_clamp(iy, height);
-			iz = svm_image_texture_wrap_clamp(iz, depth);
-		}
-
-		r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width + iz*width*height);
-		r += (1.0f - tz)*(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width + iz*width*height);
-		r += (1.0f - tz)*ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width + iz*width*height);
-		r += (1.0f - tz)*ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width + iz*width*height);
-
-		r += tz*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width + niz*width*height);
-		r += tz*(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width + niz*width*height);
-		r += tz*ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width + niz*width*height);
-		r += tz*ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width + niz*width*height);
-
-	}
-
-	return r;
-}
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index aec7bc33acd..f5855757d3f 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -149,6 +149,15 @@ ccl_device_inline uint cmj_hash(uint i, uint p)
 	return i;
 }
 
+ccl_device_inline uint cmj_hash_simple(uint i, uint p)
+{
+	i = (i ^ 61) ^ p;
+	i += i << 3;
+	i ^= i >> 4;
+	i *= 0x27d4eb2d;
+	return i;
+}
+
 ccl_device_inline float cmj_randfloat(uint i, uint p)
 {
 	return cmj_hash(i, p) * (1.0f / 4294967808.0f);
@@ -166,15 +175,26 @@ ccl_device float cmj_sample_1D(int s, int N, int p)
 	return (x + jx)*invN;
 }
 
-ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
+/* TODO(sergey): Do some extra tests and consider moving to util_math.h. */
+ccl_device_inline int cmj_isqrt(int value)
 {
-	kernel_assert(s < N);
-
 #if defined(__KERNEL_CUDA__)
-	int m = float_to_int(__fsqrt_ru(N));
+	return float_to_int(__fsqrt_ru(value));
+#elif defined(__KERNEL_GPU__)
+	return float_to_int(sqrtf(value));
 #else
-	int m = float_to_int(sqrtf(N));
+	/* This is a work around for fast-math on CPU which might replace sqrtf()
+	 * with an approximated version.
+	 */
+	return float_to_int(sqrtf(value) + 1e-6f);
 #endif
+}
+
+ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
+{
+	kernel_assert(s < N);
+
+	int m = cmj_isqrt(N);
 	int n = (N - 1)/m + 1;
 	float invN = 1.0f/N;
 	float invm = 1.0f/m;
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index d4cc36d1495..c806deee8e7 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -102,7 +102,7 @@ ccl_device_inline float area_light_sample(float3 P,
 		float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
 		cu = clamp(cu, -1.0f, 1.0f);
 		/* Compute xu. */
-		float xu = -(cu * z0) / sqrtf(1.0f - cu * cu);
+		float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f);
 		xu = clamp(xu, x0, x1);
 		/* Compute yv. */
 		float z0sq = z0 * z0;
@@ -396,11 +396,13 @@ ccl_device_inline float3 background_light_sample(KernelGlobals *kg,
 					     + (1.0f - portal_sampling_pdf) * cdf_pdf);
 				}
 				return D;
-			} else {
+			}
+			else {
 				/* Sample map, but with nonzero portal_sampling_pdf for MIS. */
 				randu = (randu - portal_sampling_pdf) / (1.0f - portal_sampling_pdf);
 			}
-		} else {
+		}
+		else {
 			/* We can't sample a portal.
 			 * Check if we can sample the map instead.
 			 */
@@ -763,78 +765,280 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 
 /* Triangle Light */
 
-ccl_device void object_transform_light_sample(KernelGlobals *kg, LightSample *ls, int object, float time)
+/* returns true if the triangle has motion blur or an instancing transform applied */
+ccl_device_inline bool triangle_world_space_vertices(KernelGlobals *kg, int object, int prim, float time, float3 V[3])
 {
+	bool has_motion = false;
+	const int object_flag = kernel_tex_fetch(__object_flag, object);
+
+	if(object_flag & SD_OBJECT_HAS_VERTEX_MOTION && time >= 0.0f) {
+		motion_triangle_vertices(kg, object, prim, time, V);
+		has_motion = true;
+	}
+	else {
+		triangle_vertices(kg, prim, V);
+	}
+
 #ifdef __INSTANCING__
-	/* instance transform */
-	if(!(kernel_tex_fetch(__object_flag, object) & SD_TRANSFORM_APPLIED)) {
+	if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 #  ifdef __OBJECT_MOTION__
-		Transform itfm;
-		Transform tfm = object_fetch_transform_motion_test(kg, object, time, &itfm);
+		Transform tfm = object_fetch_transform_motion_test(kg, object, time, NULL);
 #  else
 		Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
 #  endif
-
-		ls->P = transform_point(&tfm, ls->P);
-		ls->Ng = normalize(transform_direction(&tfm, ls->Ng));
+		V[0] = transform_point(&tfm, V[0]);
+		V[1] = transform_point(&tfm, V[1]);
+		V[2] = transform_point(&tfm, V[2]);
+		has_motion = true;
 	}
 #endif
+	return has_motion;
 }
 
-ccl_device void triangle_light_sample(KernelGlobals *kg, int prim, int object,
-	float randu, float randv, float time, LightSample *ls)
+ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg, const float3 Ng, const float3 I, float t)
 {
-	float u, v;
+	float pdf = kernel_data.integrator.pdf_triangles;
+	float cos_pi = fabsf(dot(Ng, I));
 
-	/* compute random point in triangle */
-	randu = sqrtf(randu);
+	if(cos_pi == 0.0f)
+		return 0.0f;
+
+	return t*t*pdf/cos_pi;
+}
 
-	u = 1.0f - randu;
-	v = randv*randu;
+ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *sd, float t)
+{
+	/* A naive heuristic to decide between costly solid angle sampling
+	 * and simple area sampling, comparing the distance to the triangle plane
+	 * to the length of the edges of the triangle. */
+
+	float3 V[3];
+	bool has_motion = triangle_world_space_vertices(kg, sd->object, sd->prim, sd->time, V);
+
+	const float3 e0 = V[1] - V[0];
+	const float3 e1 = V[2] - V[0];
+	const float3 e2 = V[2] - V[1];
+	const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2)));
+	const float3 N = cross(e0, e1);
+	const float distance_to_plane = fabsf(dot(N, sd->I * t))/dot(N, N);
+
+	if(longest_edge_squared > distance_to_plane*distance_to_plane) {
+		/* sd contains the point on the light source
+		 * calculate Px, the point that we're shading */
+		const float3 Px = sd->P + sd->I * t;
+		const float3 v0_p = V[0] - Px;
+		const float3 v1_p = V[1] - Px;
+		const float3 v2_p = V[2] - Px;
+
+		const float3 u01 = safe_normalize(cross(v0_p, v1_p));
+		const float3 u02 = safe_normalize(cross(v0_p, v2_p));
+		const float3 u12 = safe_normalize(cross(v1_p, v2_p));
+
+		const float alpha = fast_acosf(dot(u02, u01));
+		const float beta = fast_acosf(-dot(u01, u12));
+		const float gamma = fast_acosf(dot(u02, u12));
+		const float solid_angle =  alpha + beta + gamma - M_PI_F;
+
+		/* pdf_triangles is calculated over triangle area, but we're not sampling over its area */
+		if(UNLIKELY(solid_angle == 0.0f)) {
+			return 0.0f;
+		}
+		else {
+			float area = 1.0f;
+			if(has_motion) {
+				/* get the center frame vertices, this is what the PDF was calculated from */
+				triangle_world_space_vertices(kg, sd->object, sd->prim, -1.0f, V);
+				area = triangle_area(V[0], V[1], V[2]);
+			}
+			else {
+				area = 0.5f * len(N);
+			}
+			const float pdf = area * kernel_data.integrator.pdf_triangles;
+			return pdf / solid_angle;
+		}
+	}
+	else {
+		float pdf = triangle_light_pdf_area(kg, sd->Ng, sd->I, t);
+		if(has_motion) {
+			const float	area = 0.5f * len(N);
+			if(UNLIKELY(area == 0.0f)) {
+				return 0.0f;
+			}
+			/* scale the PDF.
+			 * area = the area the sample was taken from
+			 * area_pre = the area from which pdf_triangles was calculated */
+			triangle_world_space_vertices(kg, sd->object, sd->prim, -1.0f, V);
+			const float area_pre = triangle_area(V[0], V[1], V[2]);
+			pdf = pdf * area_pre / area;
+		}
+		return pdf;
+	}
+}
 
-	/* triangle, so get position, normal, shader */
-	triangle_point_normal(kg, object, prim, u, v, &ls->P, &ls->Ng, &ls->shader);
+ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg, int prim, int object,
+	float randu, float randv, float time, LightSample *ls, const float3 P)
+{
+	/* A naive heuristic to decide between costly solid angle sampling
+	 * and simple area sampling, comparing the distance to the triangle plane
+	 * to the length of the edges of the triangle. */
+
+	float3 V[3];
+	bool has_motion = triangle_world_space_vertices(kg, object, prim, time, V);
+
+	const float3 e0 = V[1] - V[0];
+	const float3 e1 = V[2] - V[0];
+	const float3 e2 = V[2] - V[1];
+	const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2)));
+	const float3 N0 = cross(e0, e1);
+	float Nl = 0.0f;
+	ls->Ng = safe_normalize_len(N0, &Nl);
+	float area = 0.5f * Nl;
+
+	/* flip normal if necessary */
+	const int object_flag = kernel_tex_fetch(__object_flag, object);
+	if(object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
+		ls->Ng = -ls->Ng;
+	}
+	ls->eval_fac = 1.0f;
+	ls->shader = kernel_tex_fetch(__tri_shader, prim);
 	ls->object = object;
 	ls->prim = prim;
 	ls->lamp = LAMP_NONE;
 	ls->shader |= SHADER_USE_MIS;
-	ls->t = 0.0f;
-	ls->u = u;
-	ls->v = v;
 	ls->type = LIGHT_TRIANGLE;
-	ls->eval_fac = 1.0f;
 
-	object_transform_light_sample(kg, ls, object, time);
-}
+	float distance_to_plane = fabsf(dot(N0, V[0] - P)/dot(N0, N0));
+
+	if(longest_edge_squared > distance_to_plane*distance_to_plane) {
+		/* see James Arvo, "Stratified Sampling of Spherical Triangles"
+		 * http://www.graphics.cornell.edu/pubs/1995/Arv95c.pdf */
+
+		/* project the triangle to the unit sphere
+		 * and calculate its edges and angles */
+		const float3 v0_p = V[0] - P;
+		const float3 v1_p = V[1] - P;
+		const float3 v2_p = V[2] - P;
+
+		const float3 u01 = safe_normalize(cross(v0_p, v1_p));
+		const float3 u02 = safe_normalize(cross(v0_p, v2_p));
+		const float3 u12 = safe_normalize(cross(v1_p, v2_p));
+
+		const float3 A = safe_normalize(v0_p);
+		const float3 B = safe_normalize(v1_p);
+		const float3 C = safe_normalize(v2_p);
+
+		const float cos_alpha = dot(u02, u01);
+		const float cos_beta = -dot(u01, u12);
+		const float cos_gamma = dot(u02, u12);
+
+		/* calculate dihedral angles */
+		const float alpha = fast_acosf(cos_alpha);
+		const float beta = fast_acosf(cos_beta);
+		const float gamma = fast_acosf(cos_gamma);
+		/* the area of the unit spherical triangle = solid angle */
+		const float solid_angle =  alpha + beta + gamma - M_PI_F;
+
+		/* precompute a few things
+		 * these could be re-used to take several samples
+		 * as they are independent of randu/randv */
+		const float cos_c = dot(A, B);
+		const float sin_alpha = fast_sinf(alpha);
+		const float product = sin_alpha * cos_c;
+
+		/* Select a random sub-area of the spherical triangle
+		 * and calculate the third vertex C_ of that new triangle */
+		const float phi = randu * solid_angle - alpha;
+		float s, t;
+		fast_sincosf(phi, &s, &t);
+		const float u = t - cos_alpha;
+		const float v = s + product;
+
+		const float3 U = safe_normalize(C - dot(C, A) * A);
+
+		float q = 1.0f;
+		const float det = ((v * s + u * t) * sin_alpha);
+		if(det != 0.0f) {
+			q = ((v * t - u * s) * cos_alpha - v) / det;
+		}
+		const float temp = max(1.0f - q*q, 0.0f);
 
-ccl_device float triangle_light_pdf(KernelGlobals *kg,
-	const float3 Ng, const float3 I, float t)
-{
-	float pdf = kernel_data.integrator.pdf_triangles;
-	float cos_pi = fabsf(dot(Ng, I));
+		const float3 C_ = safe_normalize(q * A + sqrtf(temp) * U);
 
-	if(cos_pi == 0.0f)
-		return 0.0f;
-	
-	return t*t*pdf/cos_pi;
+		/* Finally, select a random point along the edge of the new triangle
+		 * That point on the spherical triangle is the sampled ray direction */
+		const float z = 1.0f - randv * (1.0f - dot(C_, B));
+		ls->D = z * B + safe_sqrtf(1.0f - z*z) * safe_normalize(C_ - dot(C_, B) * B);
+
+		/* calculate intersection with the planar triangle */
+		if(!ray_triangle_intersect(P, ls->D, FLT_MAX,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+		                           (ssef*)V,
+#else
+		                           V[0], V[1], V[2],
+#endif
+		                           &ls->u, &ls->v, &ls->t)) {
+			ls->pdf = 0.0f;
+			return;
+		}
+
+		ls->P = P + ls->D * ls->t;
+
+		/* pdf_triangles is calculated over triangle area, but we're sampling over solid angle */
+		if(UNLIKELY(solid_angle == 0.0f)) {
+			ls->pdf = 0.0f;
+			return;
+		}
+		else {
+			if(has_motion) {
+				/* get the center frame vertices, this is what the PDF was calculated from */
+				triangle_world_space_vertices(kg, object, prim, -1.0f, V);
+				area = triangle_area(V[0], V[1], V[2]);
+			}
+			const float pdf = area * kernel_data.integrator.pdf_triangles;
+			ls->pdf = pdf / solid_angle;
+		}
+	}
+	else {
+		/* compute random point in triangle */
+		randu = sqrtf(randu);
+
+		const float u = 1.0f - randu;
+		const float v = randv*randu;
+		const float t = 1.0f - u - v;
+		ls->P = u * V[0] + v * V[1] + t * V[2];
+		/* compute incoming direction, distance and pdf */
+		ls->D = normalize_len(ls->P - P, &ls->t);
+		ls->pdf = triangle_light_pdf_area(kg, ls->Ng, -ls->D, ls->t);
+		if(has_motion && area != 0.0f) {
+			/* scale the PDF.
+			 * area = the area the sample was taken from
+			 * area_pre = the area from which pdf_triangles was calculated */
+			triangle_world_space_vertices(kg, object, prim, -1.0f, V);
+			const float area_pre = triangle_area(V[0], V[1], V[2]);
+			ls->pdf = ls->pdf * area_pre / area;
+		}
+		ls->u = u;
+		ls->v = v;
+	}
 }
 
 /* Light Distribution */
 
-ccl_device int light_distribution_sample(KernelGlobals *kg, float randt)
+ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu)
 {
-	/* this is basically std::upper_bound as used by pbrt, to find a point light or
+	/* This is basically std::upper_bound as used by pbrt, to find a point light or
 	 * triangle to emit from, proportional to area. a good improvement would be to
 	 * also sample proportional to power, though it's not so well defined with
-	 * OSL shaders. */
+	 * arbitrary shaders. */
 	int first = 0;
 	int len = kernel_data.integrator.num_distribution + 1;
+	float r = *randu;
 
 	while(len > 0) {
 		int half_len = len >> 1;
 		int middle = first + half_len;
 
-		if(randt < kernel_tex_fetch(__light_distribution, middle).x) {
+		if(r < kernel_tex_fetch(__light_distribution, middle).x) {
 			len = half_len;
 		}
 		else {
@@ -843,9 +1047,17 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float randt)
 		}
 	}
 
-	/* clamping should not be needed but float rounding errors seem to
-	 * make this fail on rare occasions */
-	return clamp(first-1, 0, kernel_data.integrator.num_distribution-1);
+	/* Clamping should not be needed but float rounding errors seem to
+	 * make this fail on rare occasions. */
+	int index = clamp(first-1, 0, kernel_data.integrator.num_distribution-1);
+
+	/* Rescale to reuse random number. This helps the 2D samples within
+	 * each area light be stratified as well. */
+	float distr_min = kernel_tex_fetch(__light_distribution, index).x;
+	float distr_max = kernel_tex_fetch(__light_distribution, index+1).x;
+	*randu = (r - distr_min)/(distr_max - distr_min);
+
+	return index;
 }
 
 /* Generic Light */
@@ -857,7 +1069,6 @@ ccl_device bool light_select_reached_max_bounces(KernelGlobals *kg, int index, i
 }
 
 ccl_device_noinline bool light_sample(KernelGlobals *kg,
-                                      float randt,
                                       float randu,
                                       float randv,
                                       float time,
@@ -866,7 +1077,7 @@ ccl_device_noinline bool light_sample(KernelGlobals *kg,
                                       LightSample *ls)
 {
 	/* sample index */
-	int index = light_distribution_sample(kg, randt);
+	int index = light_distribution_sample(kg, &randu);
 
 	/* fetch light data */
 	float4 l = kernel_tex_fetch(__light_distribution, index);
@@ -876,10 +1087,7 @@ ccl_device_noinline bool light_sample(KernelGlobals *kg,
 		int object = __float_as_int(l.w);
 		int shader_flag = __float_as_int(l.z);
 
-		triangle_light_sample(kg, prim, object, randu, randv, time, ls);
-		/* compute incoming direction, distance and pdf */
-		ls->D = normalize_len(ls->P - P, &ls->t);
-		ls->pdf = triangle_light_pdf(kg, ls->Ng, -ls->D, ls->t);
+		triangle_light_sample(kg, prim, object, randu, randv, time, ls, P);
 		ls->shader |= shader_flag;
 		return (ls->pdf > 0.0f);
 	}
diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h
index 9bee5603474..bd0e23b7705 100644
--- a/intern/cycles/kernel/kernel_math.h
+++ b/intern/cycles/kernel/kernel_math.h
@@ -17,11 +17,11 @@
 #ifndef __KERNEL_MATH_H__
 #define __KERNEL_MATH_H__
 
-#include "util_color.h"
-#include "util_math.h"
-#include "util_math_fast.h"
-#include "util_texture.h"
-#include "util_transform.h"
+#include "util/util_color.h"
+#include "util/util_math.h"
+#include "util/util_math_fast.h"
+#include "util/util_math_intersect.h"
+#include "util/util_texture.h"
+#include "util/util_transform.h"
 
 #endif /* __KERNEL_MATH_H__ */
-
diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h
index af7b727c1ba..9995490505f 100644
--- a/intern/cycles/kernel/kernel_montecarlo.h
+++ b/intern/cycles/kernel/kernel_montecarlo.h
@@ -67,8 +67,8 @@ ccl_device_inline void sample_cos_hemisphere(const float3 N,
 
 /* sample direction uniformly distributed in hemisphere */
 ccl_device_inline void sample_uniform_hemisphere(const float3 N,
-                                               float randu, float randv,
-                                               float3 *omega_in, float *pdf)
+                                                 float randu, float randv,
+                                                 float3 *omega_in, float *pdf)
 {
 	float z = randu;
 	float r = sqrtf(max(0.0f, 1.0f - z*z));
@@ -84,8 +84,8 @@ ccl_device_inline void sample_uniform_hemisphere(const float3 N,
 
 /* sample direction uniformly distributed in cone */
 ccl_device_inline void sample_uniform_cone(const float3 N, float angle,
-                                         float randu, float randv,
-                                         float3 *omega_in, float *pdf)
+                                           float randu, float randv,
+                                           float3 *omega_in, float *pdf)
 {
 	float z = cosf(angle*randu);
 	float r = sqrtf(max(0.0f, 1.0f - z*z));
@@ -187,4 +187,3 @@ ccl_device float2 regular_polygon_sample(float corners, float rotation, float u,
 CCL_NAMESPACE_END
 
 #endif /* __KERNEL_MONTECARLO_CL__ */
-
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 7aec47e4957..b31356905f2 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -16,19 +16,23 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value)
+#if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__)
+#define __ATOMIC_PASS_WRITE__
+#endif
+
+ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value)
 {
 	ccl_global float *buf = buffer;
-#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+#ifdef __ATOMIC_PASS_WRITE__
 	atomic_add_and_fetch_float(buf, value);
 #else
-	*buf = (sample == 0)? value: *buf + value;
-#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
+	*buf += value;
+#endif
 }
 
-ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value)
+ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3 value)
 {
-#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+#ifdef __ATOMIC_PASS_WRITE__
 	ccl_global float *buf_x = buffer + 0;
 	ccl_global float *buf_y = buffer + 1;
 	ccl_global float *buf_z = buffer + 2;
@@ -38,13 +42,13 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sa
 	atomic_add_and_fetch_float(buf_z, value.z);
 #else
 	ccl_global float3 *buf = (ccl_global float3*)buffer;
-	*buf = (sample == 0)? value: *buf + value;
-#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
+	*buf += value;
+#endif
 }
 
-ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value)
+ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4 value)
 {
-#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+#ifdef __ATOMIC_PASS_WRITE__
 	ccl_global float *buf_x = buffer + 0;
 	ccl_global float *buf_y = buffer + 1;
 	ccl_global float *buf_z = buffer + 2;
@@ -56,12 +60,137 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa
 	atomic_add_and_fetch_float(buf_w, value.w);
 #else
 	ccl_global float4 *buf = (ccl_global float4*)buffer;
-	*buf = (sample == 0)? value: *buf + value;
-#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
+	*buf += value;
+#endif
+}
+
+#ifdef __DENOISING_FEATURES__
+ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, float value)
+{
+	kernel_write_pass_float(buffer, value);
+
+	/* The online one-pass variance update that's used for the megakernel can't easily be implemented
+	 * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */
+	kernel_write_pass_float(buffer+1, value*value);
 }
 
+#  ifdef __ATOMIC_PASS_WRITE__
+#    define kernel_write_pass_float3_unaligned kernel_write_pass_float3
+#  else
+ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, float3 value)
+{
+	buffer[0] += value.x;
+	buffer[1] += value.y;
+	buffer[2] += value.z;
+}
+#  endif
+
+ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, float3 value)
+{
+	kernel_write_pass_float3_unaligned(buffer, value);
+	kernel_write_pass_float3_unaligned(buffer+3, value*value);
+}
+
+ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer,
+	int sample, float path_total, float path_total_shaded)
+{
+	if(kernel_data.film.pass_denoising_data == 0)
+		return;
+	/* Odd samples are written at the SHADOW_B offset, even samples at SHADOW_A. */
+	buffer += (sample & 1)? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A;
+
+	path_total = ensure_finite(path_total);
+	path_total_shaded = ensure_finite(path_total_shaded);
+	/* Floats 0 and 1 at that offset accumulate the two path totals. */
+	kernel_write_pass_float(buffer, path_total);
+	kernel_write_pass_float(buffer+1, path_total_shaded);
+	/* Float 2 accumulates the squared shaded/total ratio; max() guards the division. */
+	float value = path_total_shaded / max(path_total, 1e-7f);
+	kernel_write_pass_float(buffer+2, value*value);
+}
+#endif /* __DENOISING_FEATURES__ */
+
+ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
+                                                        ShaderData *sd,
+                                                        ccl_addr_space PathState *state,
+                                                        PathRadiance *L)
+{
+#ifdef __DENOISING_FEATURES__
+	if(state->denoising_feature_weight == 0.0f) {
+		return;
+	}
+
+	L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length);
+
+	/* Skip implicitly transparent surfaces. */
+	if(sd->flag & SD_HAS_ONLY_VOLUME) {
+		return;
+	}
+
+	float3 normal = make_float3(0.0f, 0.0f, 0.0f);
+	float3 albedo = make_float3(0.0f, 0.0f, 0.0f);
+	float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
+
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
+
+		if(!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
+			continue;
+
+		/* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
+		normal += sc->N * sc->sample_weight;
+		sum_weight += sc->sample_weight;
+		if(!bsdf_is_specular_like(sc)) {
+			albedo += sc->weight;
+			sum_nonspecular_weight += sc->sample_weight;
+		}
+	}
+
+	/* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. */
+	if((sum_weight == 0.0f) || (sum_nonspecular_weight*4.0f > sum_weight)) {
+		if(sum_weight != 0.0f) {
+			normal /= sum_weight;
+		}
+		L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal);
+		L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo);
+
+		state->denoising_feature_weight = 0.0f;
+	}
+#else
+	(void) kg;
+	(void) sd;
+	(void) state;
+	(void) L;
+#endif  /* __DENOISING_FEATURES__ */
+}
+
+#ifdef __KERNEL_DEBUG__
+ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
+                                                 ccl_global float *buffer,
+                                                 PathRadiance *L)
+{
+	int flag = kernel_data.film.pass_flag;
+	if(flag & PASS_BVH_TRAVERSED_NODES) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_nodes,
+		                        L->debug_data.num_bvh_traversed_nodes);
+	}
+	if(flag & PASS_BVH_TRAVERSED_INSTANCES) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances,
+		                        L->debug_data.num_bvh_traversed_instances);
+	}
+	if(flag & PASS_BVH_INTERSECTIONS) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_intersections,
+		                        L->debug_data.num_bvh_intersections);
+	}
+	if(flag & PASS_RAY_BOUNCES) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces,
+		                        L->debug_data.num_ray_bounces);
+	}
+}
+#endif /* __KERNEL_DEBUG__ */
+
 ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
-	ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
+	ShaderData *sd, ccl_addr_space PathState *state, float3 throughput)
 {
 #ifdef __PASSES__
 	int path_flag = state->flag;
@@ -75,38 +204,37 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 		return;
 	
 	if(!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) {
-		if(!(ccl_fetch(sd, flag) & SD_TRANSPARENT) ||
+		if(!(sd->flag & SD_TRANSPARENT) ||
 		   kernel_data.film.pass_alpha_threshold == 0.0f ||
 		   average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold)
 		{
-
-			if(sample == 0) {
+			if(state->sample == 0) {
 				if(flag & PASS_DEPTH) {
-					float depth = camera_distance(kg, ccl_fetch(sd, P));
-					kernel_write_pass_float(buffer + kernel_data.film.pass_depth, sample, depth);
+					float depth = camera_distance(kg, sd->P);
+					kernel_write_pass_float(buffer + kernel_data.film.pass_depth, depth);
 				}
 				if(flag & PASS_OBJECT_ID) {
-					float id = object_pass_id(kg, ccl_fetch(sd, object));
-					kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, sample, id);
+					float id = object_pass_id(kg, sd->object);
+					kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, id);
 				}
 				if(flag & PASS_MATERIAL_ID) {
 					float id = shader_pass_id(kg, sd);
-					kernel_write_pass_float(buffer + kernel_data.film.pass_material_id, sample, id);
+					kernel_write_pass_float(buffer + kernel_data.film.pass_material_id, id);
 				}
 			}
 
 			if(flag & PASS_NORMAL) {
-				float3 normal = ccl_fetch(sd, N);
-				kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal);
+				float3 normal = shader_bsdf_average_normal(kg, sd);
+				kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, normal);
 			}
 			if(flag & PASS_UV) {
 				float3 uv = primitive_uv(kg, sd);
-				kernel_write_pass_float3(buffer + kernel_data.film.pass_uv, sample, uv);
+				kernel_write_pass_float3(buffer + kernel_data.film.pass_uv, uv);
 			}
 			if(flag & PASS_MOTION) {
 				float4 speed = primitive_motion_vector(kg, sd);
-				kernel_write_pass_float4(buffer + kernel_data.film.pass_motion, sample, speed);
-				kernel_write_pass_float(buffer + kernel_data.film.pass_motion_weight, sample, 1.0f);
+				kernel_write_pass_float4(buffer + kernel_data.film.pass_motion, speed);
+				kernel_write_pass_float(buffer + kernel_data.film.pass_motion_weight, 1.0f);
 			}
 
 			state->flag |= PATH_RAY_SINGLE_PASS_DONE;
@@ -127,7 +255,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 		float mist_start = kernel_data.film.mist_start;
 		float mist_inv_depth = kernel_data.film.mist_inv_depth;
 
-		float depth = camera_distance(kg, ccl_fetch(sd, P));
+		float depth = camera_distance(kg, sd->P);
 		float mist = saturate((depth - mist_start)*mist_inv_depth);
 
 		/* falloff */
@@ -149,7 +277,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 #endif
 }
 
-ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, int sample)
+ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L)
 {
 #ifdef __PASSES__
 	int flag = kernel_data.film.pass_flag;
@@ -158,44 +286,103 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f
 		return;
 	
 	if(flag & PASS_DIFFUSE_INDIRECT)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_indirect, sample, L->indirect_diffuse);
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_indirect, L->indirect_diffuse);
 	if(flag & PASS_GLOSSY_INDIRECT)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_indirect, sample, L->indirect_glossy);
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_indirect, L->indirect_glossy);
 	if(flag & PASS_TRANSMISSION_INDIRECT)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_indirect, sample, L->indirect_transmission);
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_indirect, L->indirect_transmission);
 	if(flag & PASS_SUBSURFACE_INDIRECT)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_indirect, sample, L->indirect_subsurface);
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_indirect, L->indirect_subsurface);
 	if(flag & PASS_DIFFUSE_DIRECT)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_direct, sample, L->direct_diffuse);
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_direct, L->direct_diffuse);
 	if(flag & PASS_GLOSSY_DIRECT)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_direct, sample, L->direct_glossy);
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_direct, L->direct_glossy);
 	if(flag & PASS_TRANSMISSION_DIRECT)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_direct, sample, L->direct_transmission);
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_direct, L->direct_transmission);
 	if(flag & PASS_SUBSURFACE_DIRECT)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_direct, sample, L->direct_subsurface);
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_direct, L->direct_subsurface);
 
 	if(flag & PASS_EMISSION)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_emission, sample, L->emission);
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_emission, L->emission);
 	if(flag & PASS_BACKGROUND)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_background, sample, L->background);
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_background, L->background);
 	if(flag & PASS_AO)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, sample, L->ao);
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, L->ao);
 
 	if(flag & PASS_DIFFUSE_COLOR)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color, sample, L->color_diffuse);
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color, L->color_diffuse);
 	if(flag & PASS_GLOSSY_COLOR)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color, sample, L->color_glossy);
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color, L->color_glossy);
 	if(flag & PASS_TRANSMISSION_COLOR)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color, sample, L->color_transmission);
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color, L->color_transmission);
 	if(flag & PASS_SUBSURFACE_COLOR)
-		kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_color, sample, L->color_subsurface);
+		kernel_write_pass_float3(buffer + kernel_data.film.pass_subsurface_color, L->color_subsurface);
 	if(flag & PASS_SHADOW) {
 		float4 shadow = L->shadow;
 		shadow.w = kernel_data.film.pass_shadow_scale;
-		kernel_write_pass_float4(buffer + kernel_data.film.pass_shadow, sample, shadow);
+		kernel_write_pass_float4(buffer + kernel_data.film.pass_shadow, shadow);
 	}
 	if(flag & PASS_MIST)
-		kernel_write_pass_float(buffer + kernel_data.film.pass_mist, sample, 1.0f - L->mist);
+		kernel_write_pass_float(buffer + kernel_data.film.pass_mist, 1.0f - L->mist);
+#endif
+}
+
+ccl_device_inline void kernel_write_result(KernelGlobals *kg,
+                                           ccl_global float *buffer,
+                                           int sample,
+                                           PathRadiance *L)
+{
+	float alpha;
+	float3 L_sum = path_radiance_clamp_and_sum(kg, L, &alpha);
+
+	kernel_write_pass_float4(buffer, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));
+
+	kernel_write_light_passes(kg, buffer, L);
+
+#ifdef __DENOISING_FEATURES__
+	if(kernel_data.film.pass_denoising_data) {
+#  ifdef __SHADOW_TRICKS__
+		kernel_write_denoising_shadow(kg,
+		                              buffer + kernel_data.film.pass_denoising_data,
+		                              sample,
+		                              average(L->path_total),
+		                              average(L->path_total_shaded));
+#  else
+		kernel_write_denoising_shadow(kg,
+		                              buffer + kernel_data.film.pass_denoising_data,
+		                              sample,
+		                              0.0f, 0.0f);
+#  endif
+		if(kernel_data.film.pass_denoising_clean) {
+			float3 noisy, clean;
+			path_radiance_split_denoising(kg, L, &noisy, &clean);
+			kernel_write_pass_float3_variance(
+			        buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
+			        noisy);
+			kernel_write_pass_float3_unaligned(
+			        buffer + kernel_data.film.pass_denoising_clean,
+			        clean);
+		}
+		else {
+			kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
+			                                    ensure_finite3(L_sum));
+		}
+
+		kernel_write_pass_float3_variance(
+		        buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
+		        L->denoising_normal);
+		kernel_write_pass_float3_variance(
+		        buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
+		        L->denoising_albedo);
+		kernel_write_pass_float_variance(
+		        buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
+		        L->denoising_depth);
+	}
+#endif  /* __DENOISING_FEATURES__ */
+
+
+#ifdef __KERNEL_DEBUG__
+	kernel_write_debug_passes(kg, buffer, L);
 #endif
 }
 
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 6d89a89ed5b..652777a77a0 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -15,57 +15,344 @@
  */
 
 #ifdef __OSL__
-#  include "osl_shader.h"
+#  include "kernel/osl/osl_shader.h"
 #endif
 
-#include "kernel_random.h"
-#include "kernel_projection.h"
-#include "kernel_montecarlo.h"
-#include "kernel_differential.h"
-#include "kernel_camera.h"
+#include "kernel/kernel_random.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_differential.h"
+#include "kernel/kernel_camera.h"
 
-#include "geom/geom.h"
-#include "bvh/bvh.h"
+#include "kernel/geom/geom.h"
+#include "kernel/bvh/bvh.h"
 
-#include "kernel_accumulate.h"
-#include "kernel_shader.h"
-#include "kernel_light.h"
-#include "kernel_passes.h"
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_shader.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_passes.h"
 
 #ifdef __SUBSURFACE__
-#  include "kernel_subsurface.h"
+#  include "kernel/kernel_subsurface.h"
 #endif
 
 #ifdef __VOLUME__
-#  include "kernel_volume.h"
+#  include "kernel/kernel_volume.h"
 #endif
 
-#include "kernel_path_state.h"
-#include "kernel_shadow.h"
-#include "kernel_emission.h"
-#include "kernel_path_common.h"
-#include "kernel_path_surface.h"
-#include "kernel_path_volume.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_shadow.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_path_common.h"
+#include "kernel/kernel_path_surface.h"
+#include "kernel/kernel_path_volume.h"
+#include "kernel/kernel_path_subsurface.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Intersect the ray with the scene; the intersection result is stored in isect. */
+ccl_device_forceinline bool kernel_path_scene_intersect(
+	KernelGlobals *kg,
+	ccl_addr_space PathState *state,
+	Ray *ray,
+	Intersection *isect,
+	PathRadiance *L)
+{
+	uint visibility = path_state_ray_visibility(kg, state);
+
+	/* Must stay outside the __HAIR__ guard, so builds without hair get AO bounces too. */
+	if(path_state_ao_bounce(kg, state)) {
+		visibility = PATH_RAY_SHADOW;
+		ray->t = kernel_data.background.ao_distance;
+	}
+
+#ifdef __HAIR__
+	float difl = 0.0f, extmax = 0.0f;
+	uint lcg_state = 0;
+
+	if(kernel_data.bvh.have_curves) {
+		if((kernel_data.cam.resolution == 1) && (state->flag & PATH_RAY_CAMERA)) {
+			float3 pixdiff = ray->dD.dx + ray->dD.dy;
+			difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
+		}
+		extmax = kernel_data.curve.maximum_width;
+		lcg_state = lcg_state_init_addrspace(state, 0x51633e2d);
+	}
+
+	bool hit = scene_intersect(kg, *ray, visibility, isect, &lcg_state, difl, extmax);
+#else
+	bool hit = scene_intersect(kg, *ray, visibility, isect, NULL, 0.0f, 0.0f);
+#endif  /* __HAIR__ */
 
 #ifdef __KERNEL_DEBUG__
-#  include "kernel_debug.h"
-#endif
+	if(state->flag & PATH_RAY_CAMERA) {
+		L->debug_data.num_bvh_traversed_nodes += isect->num_traversed_nodes;
+		L->debug_data.num_bvh_traversed_instances += isect->num_traversed_instances;
+		L->debug_data.num_bvh_intersections += isect->num_intersections;
+	}
+	L->debug_data.num_ray_bounces++;
+#endif  /* __KERNEL_DEBUG__ */
 
-CCL_NAMESPACE_BEGIN
+	return hit;
+}
+
+ccl_device_forceinline void kernel_path_lamp_emission(
+	KernelGlobals *kg,
+	ccl_addr_space PathState *state,
+	Ray *ray,
+	float3 throughput,
+	ccl_addr_space Intersection *isect,
+	ShaderData *emission_sd,
+	PathRadiance *L)
+{
+#ifdef __LAMP_MIS__
+	if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
+		/* ray starting from previous non-transparent bounce */
+		Ray light_ray;
+
+		light_ray.P = ray->P - state->ray_t*ray->D;
+		state->ray_t += isect->t;
+		light_ray.D = ray->D;
+		light_ray.t = state->ray_t;
+		light_ray.time = ray->time;
+		light_ray.dD = ray->dD;
+		light_ray.dP = ray->dP;
+
+		/* intersect with lamp */
+		float3 emission;
+
+		if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission))
+			path_radiance_accum_emission(L, state, throughput, emission);
+	}
+#endif  /* __LAMP_MIS__ */
+}
+
+ccl_device_forceinline void kernel_path_background(
+	KernelGlobals *kg,
+	ccl_addr_space PathState *state,
+	ccl_addr_space Ray *ray,
+	float3 throughput,
+	ShaderData *emission_sd,
+	PathRadiance *L)
+{
+	/* eval background shader if nothing hit */
+	if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
+		L->transparent += average(throughput);
+
+#ifdef __PASSES__
+		if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
+#endif  /* __PASSES__ */
+			return;
+	}
+
+#ifdef __BACKGROUND__
+	/* sample background shader */
+	float3 L_background = indirect_background(kg, emission_sd, state, ray);
+	path_radiance_accum_background(L, state, throughput, L_background);
+#endif  /* __BACKGROUND__ */
+}
+
+#ifndef __SPLIT_KERNEL__
+
+#ifdef __VOLUME__
+ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(
+	KernelGlobals *kg,
+	ShaderData *sd,
+	PathState *state,
+	Ray *ray,
+	float3 *throughput,
+	ccl_addr_space Intersection *isect,
+	bool hit,
+	ShaderData *emission_sd,
+	PathRadiance *L)
+{
+	/* Sanitize volume stack. */
+	if(!hit) {
+		kernel_volume_clean_stack(kg, state->volume_stack);
+	}
+
+	if(state->volume_stack[0].shader == SHADER_NONE) {
+		return VOLUME_PATH_ATTENUATED;
+	}
+
+	/* volume attenuation, emission, scatter */
+	Ray volume_ray = *ray;
+	volume_ray.t = (hit)? isect->t: FLT_MAX;
+
+	bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+
+#  ifdef __VOLUME_DECOUPLED__
+	int sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
+	bool direct = (state->flag & PATH_RAY_CAMERA) != 0;
+	bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, direct, sampling_method);
+
+	if(decoupled) {
+		/* cache steps along volume for repeated sampling */
+		VolumeSegment volume_segment;
+
+		shader_setup_from_volume(kg, sd, &volume_ray);
+		kernel_volume_decoupled_record(kg, state,
+			&volume_ray, sd, &volume_segment, heterogeneous);
+
+		volume_segment.sampling_method = sampling_method;
+
+		/* emission */
+		if(volume_segment.closure_flag & SD_EMISSION)
+			path_radiance_accum_emission(L, state, *throughput, volume_segment.accum_emission);
+
+		/* scattering */
+		VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
+
+		if(volume_segment.closure_flag & SD_SCATTER) {
+			int all = kernel_data.integrator.sample_all_lights_indirect;
+
+			/* direct light sampling */
+			kernel_branched_path_volume_connect_light(kg, sd,
+				emission_sd, *throughput, state, L, all,
+				&volume_ray, &volume_segment);
+
+			/* indirect sample. if we use distance sampling and take just
+			 * one sample for direct and indirect light, we could share
+			 * this computation, but makes code a bit complex */
+			float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
+			float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
+
+			result = kernel_volume_decoupled_scatter(kg,
+				state, &volume_ray, sd, throughput,
+				rphase, rscatter, &volume_segment, NULL, true);
+		}
+
+		/* free cached steps */
+		kernel_volume_decoupled_free(kg, &volume_segment);
+
+		if(result == VOLUME_PATH_SCATTERED) {
+			if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
+				return VOLUME_PATH_SCATTERED;
+			else
+				return VOLUME_PATH_MISSED;
+		}
+		else {
+			*throughput *= volume_segment.accum_transmittance;
+		}
+	}
+	else
+#  endif  /* __VOLUME_DECOUPLED__ */
+	{
+		/* integrate along volume segment with distance sampling */
+		VolumeIntegrateResult result = kernel_volume_integrate(
+			kg, state, sd, &volume_ray, L, throughput, heterogeneous);
+
+#  ifdef __VOLUME_SCATTER__
+		if(result == VOLUME_PATH_SCATTERED) {
+			/* direct lighting */
+			kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
+
+			/* indirect light bounce */
+			if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
+				return VOLUME_PATH_SCATTERED;
+			else
+				return VOLUME_PATH_MISSED;
+		}
+#  endif  /* __VOLUME_SCATTER__ */
+	}
+
+	return VOLUME_PATH_ATTENUATED;
+}
+#endif  /* __VOLUME__ */
+
+#endif /* __SPLIT_KERNEL__ */
+
+ccl_device_forceinline bool kernel_path_shader_apply(
+	KernelGlobals *kg,
+	ShaderData *sd,
+	ccl_addr_space PathState *state,
+	ccl_addr_space Ray *ray,
+	float3 throughput,
+	ShaderData *emission_sd,
+	PathRadiance *L,
+	ccl_global float *buffer)
+{
+#ifdef __SHADOW_TRICKS__
+	if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
+		if(state->flag & PATH_RAY_CAMERA) {
+			state->flag |= (PATH_RAY_SHADOW_CATCHER |
+						   PATH_RAY_STORE_SHADOW_INFO);
+
+			float3 bg = make_float3(0.0f, 0.0f, 0.0f);
+			if(!kernel_data.background.transparent) {
+				bg = indirect_background(kg, emission_sd, state, ray);
+			}
+			path_radiance_accum_shadowcatcher(L, throughput, bg);
+		}
+	}
+	else if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+		/* Only update transparency after shadow catcher bounce. */
+		L->shadow_transparency *=
+				average(shader_bsdf_transparency(kg, sd));
+	}
+#endif  /* __SHADOW_TRICKS__ */
+
+	/* holdout */
+#ifdef __HOLDOUT__
+	if(((sd->flag & SD_HOLDOUT) ||
+		(sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
+	   (state->flag & PATH_RAY_CAMERA))
+	{
+		if(kernel_data.background.transparent) {
+			float3 holdout_weight;
+			if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
+				holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
+			}
+			else {
+				holdout_weight = shader_holdout_eval(kg, sd);
+			}
+			/* any throughput is ok, should all be identical here */
+			L->transparent += average(holdout_weight*throughput);
+		}
+
+		if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
+			return false;
+		}
+	}
+#endif  /* __HOLDOUT__ */
+
+	/* holdout mask objects do not write data passes */
+	kernel_write_data_passes(kg, buffer, L, sd, state, throughput);
+
+	/* blurring of bsdf after bounces, for rays that have a small likelihood
+	 * of following this particular path (diffuse, rough glossy) */
+	if(kernel_data.integrator.filter_glossy != FLT_MAX) {
+		float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
+
+		if(blur_pdf < 1.0f) {
+			float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
+			shader_bsdf_blur(kg, sd, blur_roughness);
+		}
+	}
+
+#ifdef __EMISSION__
+	/* emission */
+	if(sd->flag & SD_EMISSION) {
+		float3 emission = indirect_primitive_emission(kg, sd, sd->ray_length, state->flag, state->ray_pdf);
+		path_radiance_accum_emission(L, state, throughput, emission);
+	}
+#endif  /* __EMISSION__ */
+
+	return true;
+}
 
 ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
                                         ShaderData *sd,
                                         ShaderData *emission_sd,
                                         PathRadiance *L,
-                                        PathState *state,
-                                        RNG *rng,
+                                        ccl_addr_space PathState *state,
                                         float3 throughput,
                                         float3 ao_alpha)
 {
 	/* todo: solve correlation */
 	float bsdf_u, bsdf_v;
 
-	path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+	path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 
 	float ao_factor = kernel_data.background.ao_factor;
 	float3 ao_N;
@@ -75,267 +362,107 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
 
 	sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
 
-	if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+	if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
 		Ray light_ray;
 		float3 ao_shadow;
 
-		light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+		light_ray.P = ray_offset(sd->P, sd->Ng);
 		light_ray.D = ao_D;
 		light_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
-		light_ray.time = ccl_fetch(sd, time);
-#endif  /* __OBJECT_MOTION__ */
-		light_ray.dP = ccl_fetch(sd, dP);
+		light_ray.time = sd->time;
+		light_ray.dP = sd->dP;
 		light_ray.dD = differential3_zero();
 
-		if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
-			path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+		if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) {
+			path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
+		}
+		else {
+			path_radiance_accum_total_ao(L, state, throughput, ao_bsdf);
 		}
 	}
 }
 
+#ifndef __SPLIT_KERNEL__
+
+#if defined(__BRANCHED_PATH__) || defined(__BAKING__)
+
 ccl_device void kernel_path_indirect(KernelGlobals *kg,
                                      ShaderData *sd,
                                      ShaderData *emission_sd,
-                                     RNG *rng,
                                      Ray *ray,
                                      float3 throughput,
-                                     int num_samples,
                                      PathState *state,
                                      PathRadiance *L)
 {
 	/* path iteration */
 	for(;;) {
-		/* intersect scene */
+		/* Find intersection with objects in scene. */
 		Intersection isect;
-		uint visibility = path_state_ray_visibility(kg, state);
-		bool hit = scene_intersect(kg,
-		                           *ray,
-		                           visibility,
-		                           &isect,
-		                           NULL,
-		                           0.0f, 0.0f);
+		bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L);
 
-#ifdef __LAMP_MIS__
-		if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
-			/* ray starting from previous non-transparent bounce */
-			Ray light_ray;
-
-			light_ray.P = ray->P - state->ray_t*ray->D;
-			state->ray_t += isect.t;
-			light_ray.D = ray->D;
-			light_ray.t = state->ray_t;
-			light_ray.time = ray->time;
-			light_ray.dD = ray->dD;
-			light_ray.dP = ray->dP;
-
-			/* intersect with lamp */
-			float3 emission;
-			if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission)) {
-				path_radiance_accum_emission(L,
-				                             throughput,
-				                             emission,
-				                             state->bounce);
-			}
-		}
-#endif  /* __LAMP_MIS__ */
+		/* Find intersection with lamps and compute emission for MIS. */
+		kernel_path_lamp_emission(kg, state, ray, throughput, &isect, emission_sd, L);
 
 #ifdef __VOLUME__
-		/* volume attenuation, emission, scatter */
-		if(state->volume_stack[0].shader != SHADER_NONE) {
-			Ray volume_ray = *ray;
-			volume_ray.t = (hit)? isect.t: FLT_MAX;
-
-			bool heterogeneous =
-			        volume_stack_is_heterogeneous(kg,
-			                                      state->volume_stack);
-
-#  ifdef __VOLUME_DECOUPLED__
-			int sampling_method =
-			        volume_stack_sampling_method(kg,
-			                                     state->volume_stack);
-			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false, sampling_method);
-
-			if(decoupled) {
-				/* cache steps along volume for repeated sampling */
-				VolumeSegment volume_segment;
-
-				shader_setup_from_volume(kg,
-				                         sd,
-				                         &volume_ray);
-				kernel_volume_decoupled_record(kg,
-				                               state,
-				                               &volume_ray,
-				                               sd,
-				                               &volume_segment,
-				                               heterogeneous);
-
-				volume_segment.sampling_method = sampling_method;
-
-				/* emission */
-				if(volume_segment.closure_flag & SD_EMISSION) {
-					path_radiance_accum_emission(L,
-					                             throughput,
-					                             volume_segment.accum_emission,
-					                             state->bounce);
-				}
-
-				/* scattering */
-				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
-
-				if(volume_segment.closure_flag & SD_SCATTER) {
-					int all = kernel_data.integrator.sample_all_lights_indirect;
-
-					/* direct light sampling */
-					kernel_branched_path_volume_connect_light(kg,
-					                                          rng,
-					                                          sd,
-					                                          emission_sd,
-					                                          throughput,
-					                                          state,
-					                                          L,
-					                                          all,
-					                                          &volume_ray,
-					                                          &volume_segment);
-
-					/* indirect sample. if we use distance sampling and take just
-					 * one sample for direct and indirect light, we could share
-					 * this computation, but makes code a bit complex */
-					float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
-					float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
-
-					result = kernel_volume_decoupled_scatter(kg,
-					                                         state,
-					                                         &volume_ray,
-					                                         sd,
-					                                         &throughput,
-					                                         rphase,
-					                                         rscatter,
-					                                         &volume_segment,
-					                                         NULL,
-					                                         true);
-				}
-
-				/* free cached steps */
-				kernel_volume_decoupled_free(kg, &volume_segment);
-
-				if(result == VOLUME_PATH_SCATTERED) {
-					if(kernel_path_volume_bounce(kg,
-					                             rng,
-					                             sd,
-					                             &throughput,
-					                             state,
-					                             L,
-					                             ray))
-					{
-						continue;
-					}
-					else {
-						break;
-					}
-				}
-				else {
-					throughput *= volume_segment.accum_transmittance;
-				}
-			}
-			else
-#  endif  /* __VOLUME_DECOUPLED__ */
-			{
-				/* integrate along volume segment with distance sampling */
-				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, state, sd, &volume_ray, L, &throughput, rng, heterogeneous);
-
-#  ifdef __VOLUME_SCATTER__
-				if(result == VOLUME_PATH_SCATTERED) {
-					/* direct lighting */
-					kernel_path_volume_connect_light(kg,
-					                                 rng,
-					                                 sd,
-					                                 emission_sd,
-					                                 throughput,
-					                                 state,
-					                                 L);
-
-					/* indirect light bounce */
-					if(kernel_path_volume_bounce(kg,
-					                             rng,
-					                             sd,
-					                             &throughput,
-					                             state,
-					                             L,
-					                             ray))
-					{
-						continue;
-					}
-					else {
-						break;
-					}
-				}
-#  endif  /* __VOLUME_SCATTER__ */
-			}
+		/* Volume integration. */
+		VolumeIntegrateResult result = kernel_path_volume(kg,
+		                                                   sd,
+		                                                   state,
+		                                                   ray,
+		                                                   &throughput,
+		                                                   &isect,
+		                                                   hit,
+		                                                   emission_sd,
+		                                                   L);
+
+		if(result == VOLUME_PATH_SCATTERED) {
+			continue;
 		}
-#endif  /* __VOLUME__ */
+		else if(result == VOLUME_PATH_MISSED) {
+			break;
+		}
+#endif /* __VOLUME__ */
 
+		/* Shade background. */
 		if(!hit) {
-#ifdef __BACKGROUND__
-			/* sample background shader */
-			float3 L_background = indirect_background(kg, emission_sd, state, ray);
-			path_radiance_accum_background(L,
-			                               throughput,
-			                               L_background,
-			                               state->bounce);
-#endif  /* __BACKGROUND__ */
-
+			kernel_path_background(kg, state, ray, throughput, emission_sd, L);
+			break;
+		}
+		else if(path_state_ao_bounce(kg, state)) {
 			break;
 		}
 
-		/* setup shading */
+		/* Setup and evaluate shader. */
 		shader_setup_from_ray(kg,
 		                      sd,
 		                      &isect,
 		                      ray);
-		float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF);
-		shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT);
-#ifdef __BRANCHED_PATH__
-		shader_merge_closures(sd);
-#endif  /* __BRANCHED_PATH__ */
-
-		/* blurring of bsdf after bounces, for rays that have a small likelihood
-		 * of following this particular path (diffuse, rough glossy) */
-		if(kernel_data.integrator.filter_glossy != FLT_MAX) {
-			float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
-
-			if(blur_pdf < 1.0f) {
-				float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
-				shader_bsdf_blur(kg, sd, blur_roughness);
-			}
-		}
-
-#ifdef __EMISSION__
-		/* emission */
-		if(sd->flag & SD_EMISSION) {
-			float3 emission = indirect_primitive_emission(kg,
-			                                              sd,
-			                                              isect.t,
-			                                              state->flag,
-			                                              state->ray_pdf);
-			path_radiance_accum_emission(L, throughput, emission, state->bounce);
+		shader_eval_surface(kg, sd, state, state->flag);
+		shader_prepare_closures(sd, state);
+
+		/* Apply shadow catcher, holdout, emission. */
+		if(!kernel_path_shader_apply(kg,
+		                             sd,
+		                             state,
+		                             ray,
+		                             throughput,
+		                             emission_sd,
+		                             L,
+		                             NULL))
+		{
+			break;
 		}
-#endif  /* __EMISSION__ */
 
 		/* path termination. this is a strange place to put the termination, it's
 		 * mainly due to the mixed in MIS that we use. gives too many unneeded
 		 * shader evaluations, only need emission if we are going to terminate */
-		float probability =
-		        path_state_terminate_probability(kg,
-		                                         state,
-		                                         throughput*num_samples);
+		float probability = path_state_continuation_probability(kg, state, throughput);
 
 		if(probability == 0.0f) {
 			break;
 		}
 		else if(probability != 1.0f) {
-			float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
+			float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
 
 			if(terminate >= probability)
 				break;
@@ -343,10 +470,12 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 			throughput /= probability;
 		}
 
+		kernel_update_denoising_features(kg, sd, state, L);
+
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
-			kernel_path_ao(kg, sd, emission_sd, L, state, rng, throughput, make_float3(0.0f, 0.0f, 0.0f));
+			kernel_path_ao(kg, sd, emission_sd, L, state, throughput, make_float3(0.0f, 0.0f, 0.0f));
 		}
 #endif  /* __AO__ */
 
@@ -354,22 +483,18 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 		/* bssrdf scatter to a different location on the same object, replacing
 		 * the closures with a diffuse BSDF */
 		if(sd->flag & SD_BSSRDF) {
-			float bssrdf_probability;
-			ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+			float bssrdf_u, bssrdf_v;
+			path_state_rng_2D(kg,
+			                  state,
+			                  PRNG_BSDF_U,
+			                  &bssrdf_u, &bssrdf_v);
 
-			/* modify throughput for picking bssrdf or bsdf */
-			throughput *= bssrdf_probability;
+			const ShaderClosure *sc = shader_bssrdf_pick(sd, &throughput, &bssrdf_u);
 
 			/* do bssrdf scatter step if we picked a bssrdf closure */
 			if(sc) {
-				uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
-
-				float bssrdf_u, bssrdf_v;
-				path_state_rng_2D(kg,
-				                  rng,
-				                  state,
-				                  PRNG_BSDF_U,
-				                  &bssrdf_u, &bssrdf_v);
+				uint lcg_state = lcg_state_init(state, 0x68bc21eb);
+
 				subsurface_scatter_step(kg,
 				                        sd,
 				                        state,
@@ -382,11 +507,11 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 		}
 #endif  /* __SUBSURFACE__ */
 
-#if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
+#if defined(__EMISSION__)
 		if(kernel_data.integrator.use_direct_light) {
-			int all = kernel_data.integrator.sample_all_lights_indirect;
+			int all = (kernel_data.integrator.sample_all_lights_indirect) ||
+			          (state->flag & PATH_RAY_SHADOW_CATCHER);
 			kernel_branched_path_surface_connect_light(kg,
-			                                           rng,
 			                                           sd,
 			                                           emission_sd,
 			                                           state,
@@ -395,205 +520,26 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 			                                           L,
 			                                           all);
 		}
-#endif  /* defined(__EMISSION__) && defined(__BRANCHED_PATH__) */
+#endif  /* defined(__EMISSION__) */
 
-		if(!kernel_path_surface_bounce(kg, rng, sd, &throughput, state, L, ray))
+		if(!kernel_path_surface_bounce(kg, sd, &throughput, state, &L->state, ray))
 			break;
 	}
 }
 
-#ifdef __SUBSURFACE__
-#  ifndef __KERNEL_CUDA__
-ccl_device
-#  else
-ccl_device_inline
-#  endif
-bool kernel_path_subsurface_scatter(
-        KernelGlobals *kg,
-        ShaderData *sd,
-        ShaderData *emission_sd,
-        PathRadiance *L,
-        PathState *state,
-        RNG *rng,
-        Ray *ray,
-        float3 *throughput,
-        SubsurfaceIndirectRays *ss_indirect)
-{
-	float bssrdf_probability;
-	ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+#endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */
 
-	/* modify throughput for picking bssrdf or bsdf */
-	*throughput *= bssrdf_probability;
-
-	/* do bssrdf scatter step if we picked a bssrdf closure */
-	if(sc) {
-		/* We should never have two consecutive BSSRDF bounces,
-		 * the second one should be converted to a diffuse BSDF to
-		 * avoid this.
-		 */
-		kernel_assert(!ss_indirect->tracing);
-
-		uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
-
-		SubsurfaceIntersection ss_isect;
-		float bssrdf_u, bssrdf_v;
-		path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-		int num_hits = subsurface_scatter_multi_intersect(kg,
-		                                                  &ss_isect,
-		                                                  sd,
-		                                                  sc,
-		                                                  &lcg_state,
-		                                                  bssrdf_u, bssrdf_v,
-		                                                  false);
-#  ifdef __VOLUME__
-		ss_indirect->need_update_volume_stack =
-		        kernel_data.integrator.use_volumes &&
-		        ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
-#  endif  /* __VOLUME__ */
-
-		/* compute lighting with the BSDF closure */
-		for(int hit = 0; hit < num_hits; hit++) {
-			/* NOTE: We reuse the existing ShaderData, we assume the path
-			 * integration loop stops when this function returns true.
-			 */
-			subsurface_scatter_multi_setup(kg,
-			                               &ss_isect,
-			                               hit,
-			                               sd,
-			                               state,
-			                               state->flag,
-			                               sc,
-			                               false);
-
-			PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays];
-			Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays];
-			float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays];
-			PathRadiance *hit_L = &ss_indirect->L[ss_indirect->num_rays];
-
-			*hit_state = *state;
-			*hit_ray = *ray;
-			*hit_tp = *throughput;
-
-			hit_state->rng_offset += PRNG_BOUNCE_NUM;
-
-			path_radiance_init(hit_L, kernel_data.film.use_light_pass);
-			hit_L->direct_throughput = L->direct_throughput;
-			path_radiance_copy_indirect(hit_L, L);
-
-			kernel_path_surface_connect_light(kg, rng, sd, emission_sd, *hit_tp, state, hit_L);
-
-			if(kernel_path_surface_bounce(kg,
-			                              rng,
-			                              sd,
-			                              hit_tp,
-			                              hit_state,
-			                              hit_L,
-			                              hit_ray))
-			{
-#  ifdef __LAMP_MIS__
-				hit_state->ray_t = 0.0f;
-#  endif  /* __LAMP_MIS__ */
-
-#  ifdef __VOLUME__
-				if(ss_indirect->need_update_volume_stack) {
-					Ray volume_ray = *ray;
-					/* Setup ray from previous surface point to the new one. */
-					volume_ray.D = normalize_len(hit_ray->P - volume_ray.P,
-					                             &volume_ray.t);
-
-					kernel_volume_stack_update_for_subsurface(
-					    kg,
-					    emission_sd,
-					    &volume_ray,
-					    hit_state->volume_stack);
-				}
-#  endif  /* __VOLUME__ */
-				path_radiance_reset_indirect(L);
-				ss_indirect->num_rays++;
-			}
-			else {
-				path_radiance_accum_sample(L, hit_L, 1);
-			}
-		}
-		return true;
-	}
-	return false;
-}
-
-ccl_device_inline void kernel_path_subsurface_init_indirect(
-        SubsurfaceIndirectRays *ss_indirect)
-{
-	ss_indirect->tracing = false;
-	ss_indirect->num_rays = 0;
-}
-
-ccl_device void kernel_path_subsurface_accum_indirect(
-        SubsurfaceIndirectRays *ss_indirect,
-        PathRadiance *L)
+ccl_device_forceinline void kernel_path_integrate(
+	KernelGlobals *kg,
+	PathState *state,
+	float3 throughput,
+	Ray *ray,
+	PathRadiance *L,
+	ccl_global float *buffer,
+	ShaderData *emission_sd)
 {
-	if(ss_indirect->tracing) {
-		path_radiance_sum_indirect(L);
-		path_radiance_accum_sample(&ss_indirect->direct_L, L, 1);
-		if(ss_indirect->num_rays == 0) {
-			*L = ss_indirect->direct_L;
-		}
-	}
-}
-
-ccl_device void kernel_path_subsurface_setup_indirect(
-        KernelGlobals *kg,
-        SubsurfaceIndirectRays *ss_indirect,
-        PathState *state,
-        Ray *ray,
-        PathRadiance *L,
-        float3 *throughput)
-{
-	if(!ss_indirect->tracing) {
-		ss_indirect->direct_L = *L;
-	}
-	ss_indirect->tracing = true;
-
-	/* Setup state, ray and throughput for indirect SSS rays. */
-	ss_indirect->num_rays--;
-
-	Ray *indirect_ray = &ss_indirect->rays[ss_indirect->num_rays];
-	PathRadiance *indirect_L = &ss_indirect->L[ss_indirect->num_rays];
-
-	*state = ss_indirect->state[ss_indirect->num_rays];
-	*ray = *indirect_ray;
-	*L = *indirect_L;
-	*throughput = ss_indirect->throughputs[ss_indirect->num_rays];
-
-	state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM;
-}
-
-#endif  /* __SUBSURFACE__ */
-
-ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
-                                               RNG *rng,
-                                               int sample,
-                                               Ray ray,
-                                               ccl_global float *buffer)
-{
-	/* initialize */
-	PathRadiance L;
-	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-	float L_transparent = 0.0f;
-
-	path_radiance_init(&L, kernel_data.film.use_light_pass);
-
-	/* shader data memory used for both volumes and surfaces, saves stack space */
+	/* Shader data memory used for both volumes and surfaces, saves stack space. */
 	ShaderData sd;
-	/* shader data used by emission, shadows, volume stacks */
-	ShaderData emission_sd;
-
-	PathState state;
-	path_state_init(kg, &emission_sd, &state, rng, sample, &ray);
-
-#ifdef __KERNEL_DEBUG__
-	DebugData debug_data;
-	debug_data_init(&debug_data);
-#endif  /* __KERNEL_DEBUG__ */
 
 #ifdef __SUBSURFACE__
 	SubsurfaceIndirectRays ss_indirect;
@@ -604,231 +550,82 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 
 	/* path iteration */
 	for(;;) {
-		/* intersect scene */
+		/* Find intersection with objects in scene. */
 		Intersection isect;
-		uint visibility = path_state_ray_visibility(kg, &state);
-
-#ifdef __HAIR__
-		float difl = 0.0f, extmax = 0.0f;
-		uint lcg_state = 0;
-
-		if(kernel_data.bvh.have_curves) {
-			if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {	
-				float3 pixdiff = ray.dD.dx + ray.dD.dy;
-				/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
-				difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
-			}
-
-			extmax = kernel_data.curve.maximum_width;
-			lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
-		}
+		bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L);
 
-		bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
-#else
-		bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#endif  /* __HAIR__ */
-
-#ifdef __KERNEL_DEBUG__
-		if(state.flag & PATH_RAY_CAMERA) {
-			debug_data.num_bvh_traversal_steps += isect.num_traversal_steps;
-			debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
-		}
-		debug_data.num_ray_bounces++;
-#endif  /* __KERNEL_DEBUG__ */
-
-#ifdef __LAMP_MIS__
-		if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
-			/* ray starting from previous non-transparent bounce */
-			Ray light_ray;
-
-			light_ray.P = ray.P - state.ray_t*ray.D;
-			state.ray_t += isect.t;
-			light_ray.D = ray.D;
-			light_ray.t = state.ray_t;
-			light_ray.time = ray.time;
-			light_ray.dD = ray.dD;
-			light_ray.dP = ray.dP;
-
-			/* intersect with lamp */
-			float3 emission;
-
-			if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
-				path_radiance_accum_emission(&L, throughput, emission, state.bounce);
-		}
-#endif  /* __LAMP_MIS__ */
+		/* Find intersection with lamps and compute emission for MIS. */
+		kernel_path_lamp_emission(kg, state, ray, throughput, &isect, emission_sd, L);
 
 #ifdef __VOLUME__
-		/* volume attenuation, emission, scatter */
-		if(state.volume_stack[0].shader != SHADER_NONE) {
-			Ray volume_ray = ray;
-			volume_ray.t = (hit)? isect.t: FLT_MAX;
-
-			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
-
-#  ifdef __VOLUME_DECOUPLED__
-			int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
-			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method);
-
-			if(decoupled) {
-				/* cache steps along volume for repeated sampling */
-				VolumeSegment volume_segment;
-
-				shader_setup_from_volume(kg, &sd, &volume_ray);
-				kernel_volume_decoupled_record(kg, &state,
-					&volume_ray, &sd, &volume_segment, heterogeneous);
-
-				volume_segment.sampling_method = sampling_method;
-
-				/* emission */
-				if(volume_segment.closure_flag & SD_EMISSION)
-					path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
-
-				/* scattering */
-				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
-
-				if(volume_segment.closure_flag & SD_SCATTER) {
-					int all = false;
-
-					/* direct light sampling */
-					kernel_branched_path_volume_connect_light(kg, rng, &sd,
-						&emission_sd, throughput, &state, &L, all,
-						&volume_ray, &volume_segment);
-
-					/* indirect sample. if we use distance sampling and take just
-					 * one sample for direct and indirect light, we could share
-					 * this computation, but makes code a bit complex */
-					float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
-					float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
-
-					result = kernel_volume_decoupled_scatter(kg,
-						&state, &volume_ray, &sd, &throughput,
-						rphase, rscatter, &volume_segment, NULL, true);
-				}
-
-				/* free cached steps */
-				kernel_volume_decoupled_free(kg, &volume_segment);
-
-				if(result == VOLUME_PATH_SCATTERED) {
-					if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
-						continue;
-					else
-						break;
-				}
-				else {
-					throughput *= volume_segment.accum_transmittance;
-				}
-			}
-			else
-#  endif  /* __VOLUME_DECOUPLED__ */
-			{
-				/* integrate along volume segment with distance sampling */
-				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous);
-
-#  ifdef __VOLUME_SCATTER__
-				if(result == VOLUME_PATH_SCATTERED) {
-					/* direct lighting */
-					kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
-
-					/* indirect light bounce */
-					if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
-						continue;
-					else
-						break;
-				}
-#  endif  /* __VOLUME_SCATTER__ */
-			}
+		/* Volume integration. */
+		VolumeIntegrateResult result = kernel_path_volume(kg,
+		                                                   &sd,
+		                                                   state,
+		                                                   ray,
+		                                                   &throughput,
+		                                                   &isect,
+		                                                   hit,
+		                                                   emission_sd,
+		                                                   L);
+
+		if(result == VOLUME_PATH_SCATTERED) {
+			continue;
 		}
-#endif  /* __VOLUME__ */
-
-		if(!hit) {
-			/* eval background shader if nothing hit */
-			if(kernel_data.background.transparent && (state.flag & PATH_RAY_CAMERA)) {
-				L_transparent += average(throughput);
-
-#ifdef __PASSES__
-				if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif  /* __PASSES__ */
-					break;
-			}
-
-#ifdef __BACKGROUND__
-			/* sample background shader */
-			float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
-			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
-#endif  /* __BACKGROUND__ */
-
+		else if(result == VOLUME_PATH_MISSED) {
 			break;
 		}
+#endif /* __VOLUME__ */
 
-		/* setup shading */
-		shader_setup_from_ray(kg, &sd, &isect, &ray);
-		float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF);
-		shader_eval_surface(kg, &sd, rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
-
-		/* holdout */
-#ifdef __HOLDOUT__
-		if((sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK)) && (state.flag & PATH_RAY_CAMERA)) {
-			if(kernel_data.background.transparent) {
-				float3 holdout_weight;
-				
-				if(sd.flag & SD_HOLDOUT_MASK)
-					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
-				else
-					holdout_weight = shader_holdout_eval(kg, &sd);
-
-				/* any throughput is ok, should all be identical here */
-				L_transparent += average(holdout_weight*throughput);
-			}
-
-			if(sd.flag & SD_HOLDOUT_MASK)
-				break;
+		/* Shade background. */
+		if(!hit) {
+			kernel_path_background(kg, state, ray, throughput, emission_sd, L);
+			break;
 		}
-#endif  /* __HOLDOUT__ */
-
-		/* holdout mask objects do not write data passes */
-		kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
-
-		/* blurring of bsdf after bounces, for rays that have a small likelihood
-		 * of following this particular path (diffuse, rough glossy) */
-		if(kernel_data.integrator.filter_glossy != FLT_MAX) {
-			float blur_pdf = kernel_data.integrator.filter_glossy*state.min_ray_pdf;
-
-			if(blur_pdf < 1.0f) {
-				float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
-				shader_bsdf_blur(kg, &sd, blur_roughness);
-			}
+		else if(path_state_ao_bounce(kg, state)) {
+			break;
 		}
 
-#ifdef __EMISSION__
-		/* emission */
-		if(sd.flag & SD_EMISSION) {
-			/* todo: is isect.t wrong here for transparent surfaces? */
-			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
-			path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+		/* Setup and evaluate shader. */
+		shader_setup_from_ray(kg, &sd, &isect, ray);
+		shader_eval_surface(kg, &sd, state, state->flag);
+		shader_prepare_closures(&sd, state);
+
+		/* Apply shadow catcher, holdout, emission. */
+		if(!kernel_path_shader_apply(kg,
+		                             &sd,
+		                             state,
+		                             ray,
+		                             throughput,
+		                             emission_sd,
+		                             L,
+		                             buffer))
+		{
+			break;
 		}
-#endif  /* __EMISSION__ */
 
 		/* path termination. this is a strange place to put the termination, it's
 		 * mainly due to the mixed in MIS that we use. gives too many unneeded
 		 * shader evaluations, only need emission if we are going to terminate */
-		float probability = path_state_terminate_probability(kg, &state, throughput);
+		float probability = path_state_continuation_probability(kg, state, throughput);
 
 		if(probability == 0.0f) {
 			break;
 		}
 		else if(probability != 1.0f) {
-			float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
+			float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
 			if(terminate >= probability)
 				break;
 
 			throughput /= probability;
 		}
 
+		kernel_update_denoising_features(kg, &sd, state, L);
+
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
-			kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
+			kernel_path_ao(kg, &sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, &sd));
 		}
 #endif  /* __AO__ */
 
@@ -838,11 +635,10 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 		if(sd.flag & SD_BSSRDF) {
 			if(kernel_path_subsurface_scatter(kg,
 			                                  &sd,
-			                                  &emission_sd,
-			                                  &L,
-			                                  &state,
-			                                  rng,
-			                                  &ray,
+			                                  emission_sd,
+			                                  L,
+			                                  state,
+			                                  ray,
 			                                  &throughput,
 			                                  &ss_indirect))
 			{
@@ -852,25 +648,23 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 #endif  /* __SUBSURFACE__ */
 
 		/* direct lighting */
-		kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
+		kernel_path_surface_connect_light(kg, &sd, emission_sd, throughput, state, L);
 
 		/* compute direct lighting and next bounce */
-		if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
+		if(!kernel_path_surface_bounce(kg, &sd, &throughput, state, &L->state, ray))
 			break;
 	}
 
 #ifdef __SUBSURFACE__
-		kernel_path_subsurface_accum_indirect(&ss_indirect, &L);
-
 		/* Trace indirect subsurface rays by restarting the loop. this uses less
 		 * stack memory than invoking kernel_path_indirect.
 		 */
 		if(ss_indirect.num_rays) {
 			kernel_path_subsurface_setup_indirect(kg,
 			                                      &ss_indirect,
-			                                      &state,
-			                                      &ray,
-			                                      &L,
+			                                      state,
+			                                      ray,
+			                                      L,
 			                                      &throughput);
 		}
 		else {
@@ -878,48 +672,51 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 		}
 	}
 #endif  /* __SUBSURFACE__ */
-
-	float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
-
-	kernel_write_light_passes(kg, buffer, &L, sample);
-
-#ifdef __KERNEL_DEBUG__
-	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
-#endif  /* __KERNEL_DEBUG__ */
-
-	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
 }
 
 ccl_device void kernel_path_trace(KernelGlobals *kg,
-	ccl_global float *buffer, ccl_global uint *rng_state,
+	ccl_global float *buffer,
 	int sample, int x, int y, int offset, int stride)
 {
 	/* buffer offset */
 	int index = offset + x + y*stride;
 	int pass_stride = kernel_data.film.pass_stride;
 
-	rng_state += index;
 	buffer += index*pass_stride;
 
-	/* initialize random numbers and ray */
-	RNG rng;
+	/* Initialize random numbers and sample ray. */
+	uint rng_hash;
 	Ray ray;
 
-	kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
+	kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray);
 
-	/* integrate */
-	float4 L;
+	if(ray.t == 0.0f) {
+		return;
+	}
 
-	if(ray.t != 0.0f)
-		L = kernel_path_integrate(kg, &rng, sample, ray, buffer);
-	else
-		L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+	/* Initialize state. */
+	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
 
-	/* accumulate result in output buffer */
-	kernel_write_pass_float4(buffer, sample, L);
+	PathRadiance L;
+	path_radiance_init(&L, kernel_data.film.use_light_pass);
 
-	path_rng_end(kg, rng_state, rng);
+	ShaderData emission_sd;
+	PathState state;
+	path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray);
+
+	/* Integrate. */
+	kernel_path_integrate(kg,
+	                      &state,
+	                      throughput,
+	                      &ray,
+	                      &L,
+	                      buffer,
+	                      &emission_sd);
+
+	kernel_write_result(kg, buffer, sample, &L);
 }
 
+#endif  /* __SPLIT_KERNEL__ */
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index c84727ace99..42df7e85b41 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -22,8 +22,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
                                                ShaderData *sd,
                                                ShaderData *emission_sd,
                                                PathRadiance *L,
-                                               PathState *state,
-                                               RNG *rng,
+                                               ccl_addr_space PathState *state,
                                                float3 throughput)
 {
 	int num_samples = kernel_data.integrator.ao_samples;
@@ -35,46 +34,225 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
 
 	for(int j = 0; j < num_samples; j++) {
 		float bsdf_u, bsdf_v;
-		path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+		path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 
 		float3 ao_D;
 		float ao_pdf;
 
 		sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
 
-		if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+		if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
 			Ray light_ray;
 			float3 ao_shadow;
 
-			light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+			light_ray.P = ray_offset(sd->P, sd->Ng);
 			light_ray.D = ao_D;
 			light_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
-			light_ray.time = ccl_fetch(sd, time);
-#endif  /* __OBJECT_MOTION__ */
-			light_ray.dP = ccl_fetch(sd, dP);
+			light_ray.time = sd->time;
+			light_ray.dP = sd->dP;
 			light_ray.dD = differential3_zero();
 
-			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow))
-				path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+			if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) {
+				path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
+			}
+			else {
+				path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf);
+			}
 		}
 	}
 }
 
+#ifndef __SPLIT_KERNEL__
+
+#ifdef __VOLUME__
+ccl_device_forceinline void kernel_branched_path_volume(
+	KernelGlobals *kg,
+	ShaderData *sd,
+	PathState *state,
+	Ray *ray,
+	float3 *throughput,
+	ccl_addr_space Intersection *isect,
+	bool hit,
+	ShaderData *indirect_sd,
+	ShaderData *emission_sd,
+	PathRadiance *L)
+{
+	/* Sanitize volume stack. */
+	if(!hit) {
+		kernel_volume_clean_stack(kg, state->volume_stack);
+	}
+
+	if(state->volume_stack[0].shader == SHADER_NONE) {
+		return;
+	}
+
+	/* volume attenuation, emission, scatter */
+	Ray volume_ray = *ray;
+	volume_ray.t = (hit)? isect->t: FLT_MAX;
+
+	bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+
+#  ifdef __VOLUME_DECOUPLED__
+	/* decoupled ray marching only supported on CPU */
+	if(kernel_data.integrator.volume_decoupled) {
+		/* cache steps along volume for repeated sampling */
+		VolumeSegment volume_segment;
+
+		shader_setup_from_volume(kg, sd, &volume_ray);
+		kernel_volume_decoupled_record(kg, state,
+			&volume_ray, sd, &volume_segment, heterogeneous);
+
+		/* direct light sampling */
+		if(volume_segment.closure_flag & SD_SCATTER) {
+			volume_segment.sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
+
+			int all = kernel_data.integrator.sample_all_lights_direct;
+
+			kernel_branched_path_volume_connect_light(kg, sd,
+				emission_sd, *throughput, state, L, all,
+				&volume_ray, &volume_segment);
+
+			/* indirect light sampling */
+			int num_samples = kernel_data.integrator.volume_samples;
+			float num_samples_inv = 1.0f/num_samples;
+
+			for(int j = 0; j < num_samples; j++) {
+				PathState ps = *state;
+				Ray pray = *ray;
+				float3 tp = *throughput;
+
+				/* branch RNG state */
+				path_state_branch(&ps, j, num_samples);
+
+				/* scatter sample. if we used distance sampling and took just one
+				 * sample for direct and indirect light, we could share this
+				 * computation, but that would make the code more complex */
+				float rphase = path_state_rng_1D(kg, &ps, PRNG_PHASE_CHANNEL);
+				float rscatter = path_state_rng_1D(kg, &ps, PRNG_SCATTER_DISTANCE);
+
+				VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
+					&ps, &pray, sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
+
+				if(result == VOLUME_PATH_SCATTERED &&
+				   kernel_path_volume_bounce(kg,
+				                             sd,
+				                             &tp,
+				                             &ps,
+				                             &L->state,
+				                             &pray))
+				{
+					kernel_path_indirect(kg,
+					                     indirect_sd,
+					                     emission_sd,
+					                     &pray,
+					                     tp*num_samples_inv,
+					                     &ps,
+					                     L);
+
+					/* for render passes, sum and reset indirect light pass variables
+					 * for the next samples */
+					path_radiance_sum_indirect(L);
+					path_radiance_reset_indirect(L);
+				}
+			}
+		}
+
+		/* emission and transmittance */
+		if(volume_segment.closure_flag & SD_EMISSION)
+			path_radiance_accum_emission(L, state, *throughput, volume_segment.accum_emission);
+		*throughput *= volume_segment.accum_transmittance;
+
+		/* free cached steps */
+		kernel_volume_decoupled_free(kg, &volume_segment);
+	}
+	else
+#  endif  /* __VOLUME_DECOUPLED__ */
+	{
+		/* GPU: no decoupled ray marching, scatter probabilistically */
+		int num_samples = kernel_data.integrator.volume_samples;
+		float num_samples_inv = 1.0f/num_samples;
+
+		/* todo: we should cache the shader evaluations from stepping
+		 * through the volume, for now we redo them multiple times */
+
+		for(int j = 0; j < num_samples; j++) {
+			PathState ps = *state;
+			Ray pray = *ray;
+			float3 tp = (*throughput) * num_samples_inv;
+
+			/* branch RNG state */
+			path_state_branch(&ps, j, num_samples);
+
+			VolumeIntegrateResult result = kernel_volume_integrate(
+				kg, &ps, sd, &volume_ray, L, &tp, heterogeneous);
+
+#  ifdef __VOLUME_SCATTER__
+			if(result == VOLUME_PATH_SCATTERED) {
+				/* todo: support equiangular, MIS and all light sampling.
+				 * alternatively get decoupled ray marching working on the GPU */
+				kernel_path_volume_connect_light(kg, sd, emission_sd, tp, state, L);
+
+				if(kernel_path_volume_bounce(kg,
+				                             sd,
+				                             &tp,
+				                             &ps,
+				                             &L->state,
+				                             &pray))
+				{
+					kernel_path_indirect(kg,
+					                     indirect_sd,
+					                     emission_sd,
+					                     &pray,
+					                     tp,
+					                     &ps,
+					                     L);
+
+					/* for render passes, sum and reset indirect light pass variables
+					 * for the next samples */
+					path_radiance_sum_indirect(L);
+					path_radiance_reset_indirect(L);
+				}
+			}
+# endif  /* __VOLUME_SCATTER__ */
+		}
+
+		/* todo: avoid this calculation using decoupled ray marching */
+		kernel_volume_shadow(kg, emission_sd, state, &volume_ray, throughput);
+	}
+}
+#endif  /* __VOLUME__ */
 
 /* bounce off surface and integrate indirect light */
 ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
-	RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
+	ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
 	float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
 {
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		const ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
+	float sum_sample_weight = 0.0f;
+#ifdef __DENOISING_FEATURES__
+	if(state->denoising_feature_weight > 0.0f) {
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
+
+			/* transparency is not handled here, but in outer loop */
+			if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+				continue;
+			}
+
+			sum_sample_weight += sc->sample_weight;
+		}
+	}
+	else {
+		sum_sample_weight = 1.0f;
+	}
+#endif  /* __DENOISING_FEATURES__ */
+
+	for(int i = 0; i < sd->num_closure; i++) {
+		const ShaderClosure *sc = &sd->closure[i];
 
-		if(!CLOSURE_IS_BSDF(sc->type))
-			continue;
 		/* transparency is not handled here, but in outer loop */
-		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
+		if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
 			continue;
+		}
 
 		int num_samples;
 
@@ -90,34 +268,38 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
 		num_samples = ceil_to_int(num_samples_adjust*num_samples);
 
 		float num_samples_inv = num_samples_adjust/num_samples;
-		RNG bsdf_rng = cmj_hash(*rng, i);
 
 		for(int j = 0; j < num_samples; j++) {
 			PathState ps = *state;
 			float3 tp = throughput;
 			Ray bsdf_ray;
+#ifdef __SHADOW_TRICKS__
+			float shadow_transparency = L->shadow_transparency;
+#endif
+
+			ps.rng_hash = cmj_hash(state->rng_hash, i);
 
 			if(!kernel_branched_path_surface_bounce(kg,
-			                                        &bsdf_rng,
 			                                        sd,
 			                                        sc,
 			                                        j,
 			                                        num_samples,
 			                                        &tp,
 			                                        &ps,
-			                                        L,
-			                                        &bsdf_ray))
+			                                        &L->state,
+			                                        &bsdf_ray,
+			                                        sum_sample_weight))
 			{
 				continue;
 			}
 
+			ps.rng_hash = state->rng_hash;
+
 			kernel_path_indirect(kg,
 			                     indirect_sd,
 			                     emission_sd,
-			                     rng,
 			                     &bsdf_ray,
 			                     tp*num_samples_inv,
-			                     num_samples,
 			                     &ps,
 			                     L);
 
@@ -125,6 +307,10 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
 			 * for the next samples */
 			path_radiance_sum_indirect(L);
 			path_radiance_reset_indirect(L);
+
+#ifdef __SHADOW_TRICKS__
+			L->shadow_transparency = shadow_transparency;
+#endif
 		}
 	}
 }
@@ -136,28 +322,27 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
                                                         ShaderData *emission_sd,
                                                         PathRadiance *L,
                                                         PathState *state,
-                                                        RNG *rng,
                                                         Ray *ray,
                                                         float3 throughput)
 {
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(!CLOSURE_IS_BSSRDF(sc->type))
 			continue;
 
 		/* set up random number generator */
-		uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
+		uint lcg_state = lcg_state_init(state, 0x68bc21eb);
 		int num_samples = kernel_data.integrator.subsurface_samples;
 		float num_samples_inv = 1.0f/num_samples;
-		RNG bssrdf_rng = cmj_hash(*rng, i);
+		uint bssrdf_rng_hash = cmj_hash(state->rng_hash, i);
 
 		/* do subsurface scatter step with copy of shader data, this will
 		 * replace the BSSRDF with a diffuse BSDF closure */
 		for(int j = 0; j < num_samples; j++) {
 			SubsurfaceIntersection ss_isect;
 			float bssrdf_u, bssrdf_v;
-			path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+			path_branched_rng_2D(kg, bssrdf_rng_hash, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
 			int num_hits = subsurface_scatter_multi_intersect(kg,
 			                                                  &ss_isect,
 			                                                  sd,
@@ -167,8 +352,9 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 			                                                  true);
 #ifdef __VOLUME__
 			Ray volume_ray = *ray;
-			bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
-			                                ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
+			bool need_update_volume_stack =
+			        kernel_data.integrator.use_volumes &&
+			        sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
 #endif  /* __VOLUME__ */
 
 			/* compute lighting with the BSDF closure */
@@ -205,10 +391,10 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 #ifdef __EMISSION__
 				/* direct light */
 				if(kernel_data.integrator.use_direct_light) {
-					int all = kernel_data.integrator.sample_all_lights_direct;
+					int all = (kernel_data.integrator.sample_all_lights_direct) ||
+					          (state->flag & PATH_RAY_SHADOW_CATCHER);
 					kernel_branched_path_surface_connect_light(
 					        kg,
-					        rng,
 					        &bssrdf_sd,
 					        emission_sd,
 					        &hit_state,
@@ -222,7 +408,6 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 				/* indirect light */
 				kernel_branched_path_surface_indirect_light(
 				        kg,
-				        rng,
 				        &bssrdf_sd,
 				        indirect_sd,
 				        emission_sd,
@@ -236,14 +421,17 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 }
 #endif  /* __SUBSURFACE__ */
 
-ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
+ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
+                                               uint rng_hash,
+                                               int sample,
+                                               Ray ray,
+                                               ccl_global float *buffer,
+                                               PathRadiance *L)
 {
 	/* initialize */
-	PathRadiance L;
 	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-	float L_transparent = 0.0f;
 
-	path_radiance_init(&L, kernel_data.film.use_light_pass);
+	path_radiance_init(L, kernel_data.film.use_light_pass);
 
 	/* shader data memory used for both volumes and surfaces, saves stack space */
 	ShaderData sd;
@@ -251,264 +439,67 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 	ShaderData emission_sd, indirect_sd;
 
 	PathState state;
-	path_state_init(kg, &emission_sd, &state, rng, sample, &ray);
-
-#ifdef __KERNEL_DEBUG__
-	DebugData debug_data;
-	debug_data_init(&debug_data);
-#endif  /* __KERNEL_DEBUG__ */
+	path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray);
 
 	/* Main Loop
 	 * Here we only handle transparency intersections from the camera ray.
 	 * Indirect bounces are handled in kernel_branched_path_surface_indirect_light().
 	 */
 	for(;;) {
-		/* intersect scene */
+		/* Find intersection with objects in scene. */
 		Intersection isect;
-		uint visibility = path_state_ray_visibility(kg, &state);
-
-#ifdef __HAIR__
-		float difl = 0.0f, extmax = 0.0f;
-		uint lcg_state = 0;
-
-		if(kernel_data.bvh.have_curves) {
-			if(kernel_data.cam.resolution == 1) {
-				float3 pixdiff = ray.dD.dx + ray.dD.dy;
-				/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
-				difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
-			}
-
-			extmax = kernel_data.curve.maximum_width;
-			lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
-		}
-
-		bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
-#else
-		bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#endif  /* __HAIR__ */
-
-#ifdef __KERNEL_DEBUG__
-		debug_data.num_bvh_traversal_steps += isect.num_traversal_steps;
-		debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
-		debug_data.num_ray_bounces++;
-#endif  /* __KERNEL_DEBUG__ */
+		bool hit = kernel_path_scene_intersect(kg, &state, &ray, &isect, L);
 
 #ifdef __VOLUME__
-		/* volume attenuation, emission, scatter */
-		if(state.volume_stack[0].shader != SHADER_NONE) {
-			Ray volume_ray = ray;
-			volume_ray.t = (hit)? isect.t: FLT_MAX;
-			
-			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
-
-#ifdef __VOLUME_DECOUPLED__
-			/* decoupled ray marching only supported on CPU */
-
-			/* cache steps along volume for repeated sampling */
-			VolumeSegment volume_segment;
-
-			shader_setup_from_volume(kg, &sd, &volume_ray);
-			kernel_volume_decoupled_record(kg, &state,
-				&volume_ray, &sd, &volume_segment, heterogeneous);
-
-			/* direct light sampling */
-			if(volume_segment.closure_flag & SD_SCATTER) {
-				volume_segment.sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
-
-				int all = kernel_data.integrator.sample_all_lights_direct;
-
-				kernel_branched_path_volume_connect_light(kg, rng, &sd,
-					&emission_sd, throughput, &state, &L, all,
-					&volume_ray, &volume_segment);
-
-				/* indirect light sampling */
-				int num_samples = kernel_data.integrator.volume_samples;
-				float num_samples_inv = 1.0f/num_samples;
-
-				for(int j = 0; j < num_samples; j++) {
-					/* workaround to fix correlation bug in T38710, can find better solution
-					 * in random number generator later, for now this is done here to not impact
-					 * performance of rendering without volumes */
-					RNG tmp_rng = cmj_hash(*rng, state.rng_offset);
-
-					PathState ps = state;
-					Ray pray = ray;
-					float3 tp = throughput;
-
-					/* branch RNG state */
-					path_state_branch(&ps, j, num_samples);
-
-					/* scatter sample. if we use distance sampling and take just one
-					 * sample for direct and indirect light, we could share this
-					 * computation, but makes code a bit complex */
-					float rphase = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_PHASE);
-					float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE);
-
-					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
-						&ps, &pray, &sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
-
-					(void)result;
-					kernel_assert(result == VOLUME_PATH_SCATTERED);
-
-					if(kernel_path_volume_bounce(kg,
-					                             rng,
-					                             &sd,
-					                             &tp,
-					                             &ps,
-					                             &L,
-					                             &pray))
-					{
-						kernel_path_indirect(kg,
-						                     &indirect_sd,
-						                     &emission_sd,
-						                     rng,
-						                     &pray,
-						                     tp*num_samples_inv,
-						                     num_samples,
-						                     &ps,
-						                     &L);
-
-						/* for render passes, sum and reset indirect light pass variables
-						 * for the next samples */
-						path_radiance_sum_indirect(&L);
-						path_radiance_reset_indirect(&L);
-					}
-				}
-			}
-
-			/* emission and transmittance */
-			if(volume_segment.closure_flag & SD_EMISSION)
-				path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
-			throughput *= volume_segment.accum_transmittance;
-
-			/* free cached steps */
-			kernel_volume_decoupled_free(kg, &volume_segment);
-#else
-			/* GPU: no decoupled ray marching, scatter probalistically */
-			int num_samples = kernel_data.integrator.volume_samples;
-			float num_samples_inv = 1.0f/num_samples;
-
-			/* todo: we should cache the shader evaluations from stepping
-			 * through the volume, for now we redo them multiple times */
-
-			for(int j = 0; j < num_samples; j++) {
-				PathState ps = state;
-				Ray pray = ray;
-				float3 tp = throughput * num_samples_inv;
-
-				/* branch RNG state */
-				path_state_branch(&ps, j, num_samples);
-
-				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous);
-
-#ifdef __VOLUME_SCATTER__
-				if(result == VOLUME_PATH_SCATTERED) {
-					/* todo: support equiangular, MIS and all light sampling.
-					 * alternatively get decoupled ray marching working on the GPU */
-					kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L);
-
-					if(kernel_path_volume_bounce(kg,
-					                             rng,
-					                             &sd,
-					                             &tp,
-					                             &ps,
-					                             &L,
-					                             &pray))
-					{
-						kernel_path_indirect(kg,
-						                     &indirect_sd,
-						                     &emission_sd,
-						                     rng,
-						                     &pray,
-						                     tp,
-						                     num_samples,
-						                     &ps,
-						                     &L);
-
-						/* for render passes, sum and reset indirect light pass variables
-						 * for the next samples */
-						path_radiance_sum_indirect(&L);
-						path_radiance_reset_indirect(&L);
-					}
-				}
-#endif  /* __VOLUME_SCATTER__ */
-			}
-
-			/* todo: avoid this calculation using decoupled ray marching */
-			kernel_volume_shadow(kg, &emission_sd, &state, &volume_ray, &throughput);
-#endif  /* __VOLUME_DECOUPLED__ */
-		}
+		/* Volume integration. */
+		kernel_branched_path_volume(kg,
+		                            &sd,
+		                            &state,
+		                            &ray,
+		                            &throughput,
+		                            &isect,
+		                            hit,
+		                            &indirect_sd,
+		                            &emission_sd,
+		                            L);
 #endif  /* __VOLUME__ */
 
+		/* Shade background. */
 		if(!hit) {
-			/* eval background shader if nothing hit */
-			if(kernel_data.background.transparent) {
-				L_transparent += average(throughput);
-
-#ifdef __PASSES__
-				if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif  /* __PASSES__ */
-					break;
-			}
-
-#ifdef __BACKGROUND__
-			/* sample background shader */
-			float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
-			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
-#endif  /* __BACKGROUND__ */
-
+			kernel_path_background(kg, &state, &ray, throughput, &emission_sd, L);
 			break;
 		}
 
-		/* setup shading */
+		/* Setup and evaluate shader. */
 		shader_setup_from_ray(kg, &sd, &isect, &ray);
-		shader_eval_surface(kg, &sd, rng, &state, 0.0f, state.flag, SHADER_CONTEXT_MAIN);
+		shader_eval_surface(kg, &sd, &state, state.flag);
 		shader_merge_closures(&sd);
 
-		/* holdout */
-#ifdef __HOLDOUT__
-		if(sd.flag & (SD_HOLDOUT|SD_HOLDOUT_MASK)) {
-			if(kernel_data.background.transparent) {
-				float3 holdout_weight;
-				
-				if(sd.flag & SD_HOLDOUT_MASK)
-					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
-				else
-					holdout_weight = shader_holdout_eval(kg, &sd);
-
-				/* any throughput is ok, should all be identical here */
-				L_transparent += average(holdout_weight*throughput);
-			}
-
-			if(sd.flag & SD_HOLDOUT_MASK)
-				break;
-		}
-#endif  /* __HOLDOUT__ */
-
-		/* holdout mask objects do not write data passes */
-		kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
-
-#ifdef __EMISSION__
-		/* emission */
-		if(sd.flag & SD_EMISSION) {
-			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
-			path_radiance_accum_emission(&L, throughput, emission, state.bounce);
+		/* Apply shadow catcher, holdout, emission. */
+		if(!kernel_path_shader_apply(kg,
+		                             &sd,
+		                             &state,
+		                             &ray,
+		                             throughput,
+		                             &emission_sd,
+		                             L,
+		                             buffer))
+		{
+			break;
 		}
-#endif  /* __EMISSION__ */
 
 		/* transparency termination */
 		if(state.flag & PATH_RAY_TRANSPARENT) {
 			/* path termination. this is a strange place to put the termination, it's
 			 * mainly due to the mixed in MIS that we use. gives too many unneeded
 			 * shader evaluations, only need emission if we are going to terminate */
-			float probability = path_state_terminate_probability(kg, &state, throughput);
+			float probability = path_state_continuation_probability(kg, &state, throughput);
 
 			if(probability == 0.0f) {
 				break;
 			}
 			else if(probability != 1.0f) {
-				float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
+				float terminate = path_state_rng_1D(kg, &state, PRNG_TERMINATE);
 
 				if(terminate >= probability)
 					break;
@@ -517,10 +508,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			}
 		}
 
+		kernel_update_denoising_features(kg, &sd, &state, L);
+
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
-			kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
+			kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, throughput);
 		}
 #endif  /* __AO__ */
 
@@ -528,7 +521,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		/* bssrdf scatter to a different location on the same object */
 		if(sd.flag & SD_BSSRDF) {
 			kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
-			                                        &L, &state, rng, &ray, throughput);
+			                                        L, &state, &ray, throughput);
 		}
 #endif  /* __SUBSURFACE__ */
 
@@ -538,15 +531,16 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 #ifdef __EMISSION__
 			/* direct light */
 			if(kernel_data.integrator.use_direct_light) {
-				int all = kernel_data.integrator.sample_all_lights_direct;
-				kernel_branched_path_surface_connect_light(kg, rng,
-					&sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all);
+				int all = (kernel_data.integrator.sample_all_lights_direct) ||
+				          (state.flag & PATH_RAY_SHADOW_CATCHER);
+				kernel_branched_path_surface_connect_light(kg,
+					&sd, &emission_sd, &hit_state, throughput, 1.0f, L, all);
 			}
 #endif  /* __EMISSION__ */
 
 			/* indirect light */
-			kernel_branched_path_surface_indirect_light(kg, rng,
-				&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L);
+			kernel_branched_path_surface_indirect_light(kg,
+				&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L);
 
 			/* continue in case of transparency */
 			throughput *= shader_bsdf_transparency(kg, &sd);
@@ -574,50 +568,35 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
 #endif  /* __VOLUME__ */
 	}
-
-	float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
-
-	kernel_write_light_passes(kg, buffer, &L, sample);
-
-#ifdef __KERNEL_DEBUG__
-	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
-#endif  /* __KERNEL_DEBUG__ */
-
-	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
 }
 
 ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
-	ccl_global float *buffer, ccl_global uint *rng_state,
+	ccl_global float *buffer,
 	int sample, int x, int y, int offset, int stride)
 {
 	/* buffer offset */
 	int index = offset + x + y*stride;
 	int pass_stride = kernel_data.film.pass_stride;
 
-	rng_state += index;
 	buffer += index*pass_stride;
 
 	/* initialize random numbers and ray */
-	RNG rng;
+	uint rng_hash;
 	Ray ray;
 
-	kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
+	kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray);
 
 	/* integrate */
-	float4 L;
-
-	if(ray.t != 0.0f)
-		L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer);
-	else
-		L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
-	/* accumulate result in output buffer */
-	kernel_write_pass_float4(buffer, sample, L);
+	PathRadiance L;
 
-	path_rng_end(kg, rng_state, rng);
+	if(ray.t != 0.0f) {
+		kernel_branched_path_integrate(kg, rng_hash, sample, ray, buffer, &L);
+		kernel_write_result(kg, buffer, sample, &L);
+	}
 }
 
+#endif  /* __SPLIT_KERNEL__ */
+
 #endif  /* __BRANCHED_PATH__ */
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h
index 13597eab287..d83fd474cde 100644
--- a/intern/cycles/kernel/kernel_path_common.h
+++ b/intern/cycles/kernel/kernel_path_common.h
@@ -14,15 +14,14 @@
  * limitations under the License.
  */
 
-#include "util_hash.h"
+#include "util/util_hash.h"
 
 CCL_NAMESPACE_BEGIN
 
 ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg,
-                                               ccl_global uint *rng_state,
                                                int sample,
                                                int x, int y,
-                                               ccl_addr_space RNG *rng,
+                                               uint *rng_hash,
                                                ccl_addr_space Ray *ray)
 {
 	float filter_u;
@@ -30,24 +29,20 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg,
 
 	int num_samples = kernel_data.integrator.aa_samples;
 
-	if(sample == 0) {
-		*rng_state = hash_int_2d(x, y);
-	}
-
-	path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v);
+	path_rng_init(kg, sample, num_samples, rng_hash, x, y, &filter_u, &filter_v);
 
 	/* sample camera ray */
 
 	float lens_u = 0.0f, lens_v = 0.0f;
 
 	if(kernel_data.cam.aperturesize > 0.0f)
-		path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
+		path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
 
 	float time = 0.0f;
 
 #ifdef __CAMERA_MOTION__
 	if(kernel_data.cam.shuttertime != -1.0f)
-		time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME);
+		time = path_rng_1D(kg, *rng_hash, sample, num_samples, PRNG_TIME);
 #endif
 
 	camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index 661dc52fb31..eccee54c0e3 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -19,15 +19,17 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void path_state_init(KernelGlobals *kg,
                                        ShaderData *stack_sd,
                                        ccl_addr_space PathState *state,
-                                       ccl_addr_space RNG *rng,
+                                       uint rng_hash,
                                        int sample,
                                        ccl_addr_space Ray *ray)
 {
 	state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP;
 
+	state->rng_hash = rng_hash;
 	state->rng_offset = PRNG_BASE_NUM;
 	state->sample = sample;
 	state->num_samples = kernel_data.integrator.aa_samples;
+	state->branch_factor = 1.0f;
 
 	state->bounce = 0;
 	state->diffuse_bounce = 0;
@@ -35,6 +37,16 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
 	state->transmission_bounce = 0;
 	state->transparent_bounce = 0;
 
+#ifdef __DENOISING_FEATURES__
+	if(kernel_data.film.pass_denoising_data) {
+		state->flag |= PATH_RAY_STORE_SHADOW_INFO;
+		state->denoising_feature_weight = 1.0f;
+	}
+	else {
+		state->denoising_feature_weight = 0.0f;
+	}
+#endif  /* __DENOISING_FEATURES__ */
+
 	state->min_ray_pdf = FLT_MAX;
 	state->ray_pdf = 0.0f;
 #ifdef __LAMP_MIS__
@@ -48,7 +60,7 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
 		/* Initialize volume stack with volume we are inside of. */
 		kernel_volume_stack_init(kg, stack_sd, state, ray, state->volume_stack);
 		/* Seed RNG for cases where we can't use stratified samples .*/
-		state->rng_congruential = lcg_init(*rng + sample*0x51633e2d);
+		state->rng_congruential = lcg_init(rng_hash + sample*0x51633e2d);
 	}
 	else {
 		state->volume_stack[0].shader = SHADER_NONE;
@@ -64,12 +76,12 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
 		state->flag |= PATH_RAY_TRANSPARENT;
 		state->transparent_bounce++;
 
-		/* don't increase random number generator offset here, to avoid some
-		 * unwanted patterns, see path_state_rng_1D_for_decision */
-
 		if(!kernel_data.integrator.transparent_shadows)
 			state->flag |= PATH_RAY_MIS_SKIP;
 
+		/* random number generator next bounce */
+		state->rng_offset += PRNG_BOUNCE_NUM;
+
 		return;
 	}
 
@@ -124,9 +136,15 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
 
 	/* random number generator next bounce */
 	state->rng_offset += PRNG_BOUNCE_NUM;
+
+#ifdef __DENOISING_FEATURES__
+	if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) {
+		state->flag &= ~PATH_RAY_STORE_SHADOW_INFO;
+	}
+#endif
 }
 
-ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state)
+ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, ccl_addr_space PathState *state)
 {
 	uint flag = state->flag & PATH_RAY_ALL_VISIBILITY;
 
@@ -140,17 +158,28 @@ ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *s
 	return flag;
 }
 
-ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_addr_space PathState *state, const float3 throughput)
+ccl_device_inline float path_state_continuation_probability(KernelGlobals *kg,
+                                                            ccl_addr_space PathState *state,
+                                                            const float3 throughput)
 {
 	if(state->flag & PATH_RAY_TRANSPARENT) {
-		/* transparent rays treated separately */
-		if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce)
+		/* Transparent rays are treated separately with own max bounces. */
+		if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
 			return 0.0f;
-		else if(state->transparent_bounce <= kernel_data.integrator.transparent_min_bounce)
+		}
+		/* Do at least one bounce without RR. */
+		else if(state->transparent_bounce <= 1) {
 			return 1.0f;
+		}
+#ifdef __SHADOW_TRICKS__
+		/* Exception for shadow catcher not working correctly with RR. */
+		else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) {
+			return 1.0f;
+		}
+#endif
 	}
 	else {
-		/* other rays */
+		/* Test max bounces for various ray types. */
 		if((state->bounce >= kernel_data.integrator.max_bounce) ||
 		   (state->diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) ||
 		   (state->glossy_bounce >= kernel_data.integrator.max_glossy_bounce) ||
@@ -161,13 +190,21 @@ ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_
 		{
 			return 0.0f;
 		}
-		else if(state->bounce <= kernel_data.integrator.min_bounce) {
+		/* Do at least one bounce without RR. */
+		else if(state->bounce <= 1) {
 			return 1.0f;
 		}
+#ifdef __SHADOW_TRICKS__
+		/* Exception for shadow catcher not working correctly with RR. */
+		else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) {
+			return 1.0f;
+		}
+#endif
 	}
 
-	/* probalistic termination */
-	return average(throughput); /* todo: try using max here */
+	/* Probabilistic termination: use sqrt() to roughly match typical view
+	 * transform and do path termination a bit later on average. */
+	return min(sqrtf(max3(fabs(throughput)) * state->branch_factor), 1.0f);
 }
 
 /* TODO(DingTo): Find more meaningful name for this */
@@ -180,5 +217,30 @@ ccl_device_inline void path_state_modify_bounce(ccl_addr_space PathState *state,
 		state->bounce -= 1;
 }
 
+ccl_device_inline bool path_state_ao_bounce(KernelGlobals *kg, ccl_addr_space PathState *state)
+{
+    if(state->bounce <= kernel_data.integrator.ao_bounces) {
+        return false;  /* Total bounce count has not exceeded the ao_bounces setting. */
+    }
+    /* Otherwise discount every transmission bounce, plus one if any glossy bounce happened, and re-test. */
+    int bounce = state->bounce - state->transmission_bounce - (state->glossy_bounce > 0);
+    return (bounce > kernel_data.integrator.ao_bounces);
+}
+
+ccl_device_inline void path_state_branch(ccl_addr_space PathState *state,
+                                         int branch,
+                                         int num_branches)
+{
+	state->rng_offset += PRNG_BOUNCE_NUM;  /* Same offset step path_state_next() applies per bounce. */
+	/* With num_branches <= 1 the sample index, sample count and branch factor stay untouched. */
+	if(num_branches > 1) {
+		/* Path is splitting into a branch, adjust so that each branch
+		 * still gets a unique sample from the same sequence. */
+		state->sample = state->sample*num_branches + branch;
+		state->num_samples = state->num_samples*num_branches;
+		state->branch_factor *= num_branches;  /* Read back by path_state_continuation_probability(). */
+	}
+}
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h
new file mode 100644
index 00000000000..1436e8e5a5b
--- /dev/null
+++ b/intern/cycles/kernel/kernel_path_subsurface.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright 2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __SUBSURFACE__
+#  ifndef __KERNEL_CUDA__
+ccl_device
+#  else
+ccl_device_inline
+#  endif
+bool kernel_path_subsurface_scatter(
+        KernelGlobals *kg,
+        ShaderData *sd,
+        ShaderData *emission_sd,
+        PathRadiance *L,
+        ccl_addr_space PathState *state,
+        ccl_addr_space Ray *ray,
+        ccl_addr_space float3 *throughput,
+        ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
+{
+	float bssrdf_u, bssrdf_v;
+	path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+	/* bssrdf_u is passed by pointer, so the pick below may modify it before it is reused for the scatter. */
+	const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u);
+
+	/* do bssrdf scatter step if we picked a bssrdf closure */
+	if(sc) {
+		/* We should never have two consecutive BSSRDF bounces,
+		 * the second one should be converted to a diffuse BSDF to
+		 * avoid this.
+		 */
+		kernel_assert(!(state->flag & PATH_RAY_DIFFUSE_ANCESTOR));
+
+		uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb);
+
+		SubsurfaceIntersection ss_isect;
+		int num_hits = subsurface_scatter_multi_intersect(kg,
+		                                                  &ss_isect,
+		                                                  sd,
+		                                                  sc,
+		                                                  &lcg_state,
+		                                                  bssrdf_u, bssrdf_v,
+		                                                  false);
+#  ifdef __VOLUME__
+		bool need_update_volume_stack =
+		        kernel_data.integrator.use_volumes &&
+		        sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
+#  endif  /* __VOLUME__ */
+
+		/* For every hit: connect it to a light, then try to queue an indirect ray in ss_indirect. */
+		for(int hit = 0; hit < num_hits; hit++) {
+			/* NOTE: We reuse the existing ShaderData, we assume the path
+			 * integration loop stops when this function returns true.
+			 */
+			subsurface_scatter_multi_setup(kg,
+			                               &ss_isect,
+			                               hit,
+			                               sd,
+			                               state,
+			                               state->flag,
+			                               sc,
+			                               false);
+
+			kernel_path_surface_connect_light(kg, sd, emission_sd, *throughput, state, L);
+			/* Tentatively fill the next free ss_indirect slot; it is only kept if the bounce below succeeds. */
+			ccl_addr_space PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays];
+			ccl_addr_space Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays];
+			ccl_addr_space float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays];
+			PathRadianceState *hit_L_state = &ss_indirect->L_state[ss_indirect->num_rays];
+			/* NOTE(review): no check of num_rays against the arrays' capacity is visible here - confirm the bound. */
+			*hit_state = *state;
+			*hit_ray = *ray;
+			*hit_tp = *throughput;
+			*hit_L_state = L->state;
+
+			hit_state->rng_offset += PRNG_BOUNCE_NUM;
+
+			if(kernel_path_surface_bounce(kg,
+			                              sd,
+			                              hit_tp,
+			                              hit_state,
+			                              hit_L_state,
+			                              hit_ray))
+			{
+#  ifdef __LAMP_MIS__
+				hit_state->ray_t = 0.0f;
+#  endif  /* __LAMP_MIS__ */
+
+#  ifdef __VOLUME__
+				if(need_update_volume_stack) {
+					Ray volume_ray = *ray;
+					/* Setup ray from previous surface point to the new one. */
+					volume_ray.D = normalize_len(hit_ray->P - volume_ray.P,
+					                             &volume_ray.t);
+
+					kernel_volume_stack_update_for_subsurface(
+					    kg,
+					    emission_sd,
+					    &volume_ray,
+					    hit_state->volume_stack);
+				}
+#  endif  /* __VOLUME__ */
+				ss_indirect->num_rays++;  /* Commit the slot; without this the next hit overwrites it. */
+			}
+		}
+		return true;  /* A BSSRDF closure was picked, whether or not any hit queued a ray. */
+	}
+	return false;  /* No BSSRDF closure was picked. */
+}
+
+ccl_device_inline void kernel_path_subsurface_init_indirect(
+        ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
+{
+	ss_indirect->num_rays = 0;  /* Start with no queued indirect subsurface rays. */
+}
+
+ccl_device void kernel_path_subsurface_setup_indirect(
+        KernelGlobals *kg,
+        ccl_addr_space SubsurfaceIndirectRays *ss_indirect,
+        ccl_addr_space PathState *state,
+        ccl_addr_space Ray *ray,
+        PathRadiance *L,
+        ccl_addr_space float3 *throughput)
+{
+	/* Pop the last queued indirect SSS ray into state, ray, L->state and throughput. */
+	ss_indirect->num_rays--;  /* No check here that num_rays > 0; presumably the caller guarantees it. */
+	/* NOTE(review): presumably folds the finished path's indirect light into L before reuse - confirm. */
+	path_radiance_sum_indirect(L);
+	path_radiance_reset_indirect(L);
+	/* After the decrement, num_rays indexes the entry being taken off the end of the arrays. */
+	*state = ss_indirect->state[ss_indirect->num_rays];
+	*ray = ss_indirect->rays[ss_indirect->num_rays];
+	L->state = ss_indirect->L_state[ss_indirect->num_rays];
+	*throughput = ss_indirect->throughputs[ss_indirect->num_rays];
+	/* Shift the RNG offset by one PRNG_BOUNCE_NUM step for every ray still left in the queue. */
+	state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM;
+}
+
+#endif  /* __SUBSURFACE__ */
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index fea503d06e5..7b566b01b04 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -16,16 +16,21 @@
 
 CCL_NAMESPACE_BEGIN
 
-#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__)
-
+#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) || defined(__BAKING__)
 /* branched path tracing: connect path directly to position on one or more lights and add it to L */
-ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, ShaderData *emission_sd, PathState *state, float3 throughput,
-	float num_samples_adjust, PathRadiance *L, int sample_all_lights)
+ccl_device_noinline void kernel_branched_path_surface_connect_light(
+        KernelGlobals *kg,
+        ShaderData *sd,
+        ShaderData *emission_sd,
+        ccl_addr_space PathState *state,
+        float3 throughput,
+        float num_samples_adjust,
+        PathRadiance *L,
+        int sample_all_lights)
 {
 #ifdef __EMISSION__
 	/* sample illumination from lights to find path contribution */
-	if(!(ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))
+	if(!(sd->flag & SD_BSDF_HAS_EVAL))
 		return;
 
 	Ray light_ray;
@@ -33,7 +38,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 	bool is_lamp;
 
 #  ifdef __OBJECT_MOTION__
-	light_ray.time = ccl_fetch(sd, time);
+	light_ray.time = sd->time;
 #  endif
 
 	if(sample_all_lights) {
@@ -44,15 +49,15 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 
 			int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
 			float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
-			RNG lamp_rng = cmj_hash(*rng, i);
+			uint lamp_rng_hash = cmj_hash(state->rng_hash, i);
 
 			for(int j = 0; j < num_samples; j++) {
 				float light_u, light_v;
-				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
-				float terminate = path_branched_rng_light_termination(kg, &lamp_rng, state, j, num_samples);
+				path_branched_rng_2D(kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+				float terminate = path_branched_rng_light_termination(kg, lamp_rng_hash, state, j, num_samples);
 
 				LightSample ls;
-				if(lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls)) {
+				if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) {
 					/* The sampling probability returned by lamp_light_sample assumes that all lights were sampled.
 					 * However, this code only samples lamps, so if the scene also had mesh lights, the real probability is twice as high. */
 					if(kernel_data.integrator.pdf_triangles != 0.0f)
@@ -62,9 +67,12 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 						/* trace shadow ray */
 						float3 shadow;
 
-						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+						if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 							/* accumulate */
-							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+							path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
+						}
+						else {
+							path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
 						}
 					}
 				}
@@ -77,17 +85,16 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 			float num_samples_inv = num_samples_adjust/num_samples;
 
 			for(int j = 0; j < num_samples; j++) {
-				float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT);
 				float light_u, light_v;
-				path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
-				float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples);
+				path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+				float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples);
 
 				/* only sample triangle lights */
 				if(kernel_data.integrator.num_all_lights)
-					light_t = 0.5f*light_t;
+					light_u = 0.5f*light_u;
 
 				LightSample ls;
-				if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
+				if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
 					/* Same as above, probability needs to be corrected since the sampling was forced to select a mesh light. */
 					if(kernel_data.integrator.num_all_lights)
 						ls.pdf *= 2.0f;
@@ -96,9 +103,12 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 						/* trace shadow ray */
 						float3 shadow;
 
-						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+						if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 							/* accumulate */
-							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+							path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
+						}
+						else {
+							path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
 						}
 					}
 				}
@@ -107,21 +117,23 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 	}
 	else {
 		/* sample one light at random */
-		float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 		float light_u, light_v;
-		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-		float terminate = path_state_rng_light_termination(kg, rng, state);
+		path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
+		float terminate = path_state_rng_light_termination(kg, state);
 
 		LightSample ls;
-		if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
+		if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
 			/* sample random light */
 			if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 				/* trace shadow ray */
 				float3 shadow;
 
-				if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+				if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 					/* accumulate */
-					path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
+					path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp);
+				}
+				else {
+					path_radiance_accum_total_light(L, state, throughput*num_samples_adjust, &L_light);
 				}
 			}
 		}
@@ -130,9 +142,17 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 }
 
 /* branched path tracing: bounce off or through surface to with new direction stored in ray */
-ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, const ShaderClosure *sc, int sample, int num_samples,
-	float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
+ccl_device bool kernel_branched_path_surface_bounce(
+        KernelGlobals *kg,
+        ShaderData *sd,
+        const ShaderClosure *sc,
+        int sample,
+        int num_samples,
+        ccl_addr_space float3 *throughput,
+        ccl_addr_space PathState *state,
+        PathRadianceState *L_state,
+        ccl_addr_space Ray *ray,
+        float sum_sample_weight)
 {
 	/* sample BSDF */
 	float bsdf_pdf;
@@ -140,7 +160,7 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 	float3 bsdf_omega_in;
 	differential3 bsdf_domega_in;
 	float bsdf_u, bsdf_v;
-	path_branched_rng_2D(kg, rng, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+	path_branched_rng_2D(kg, state->rng_hash, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 	int label;
 
 	label = shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval,
@@ -150,21 +170,25 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 		return false;
 
 	/* modify throughput */
-	path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
+	path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
+
+#ifdef __DENOISING_FEATURES__
+	state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples);
+#endif
 
 	/* modify path state */
 	path_state_next(kg, state, label);
 
 	/* setup ray */
-	ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
+	ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
 	ray->D = normalize(bsdf_omega_in);
 	ray->t = FLT_MAX;
 #ifdef __RAY_DIFFERENTIALS__
-	ray->dP = ccl_fetch(sd, dP);
+	ray->dP = sd->dP;
 	ray->dD = bsdf_domega_in;
 #endif
 #ifdef __OBJECT_MOTION__
-	ray->time = ccl_fetch(sd, time);
+	ray->time = sd->time;
 #endif
 
 #ifdef __VOLUME__
@@ -188,64 +212,77 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 
 #endif
 
-#ifndef __SPLIT_KERNEL__
 /* path tracing: connect path directly to position on a light and add it to L */
-ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_addr_space RNG *rng,
+ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg,
 	ShaderData *sd, ShaderData *emission_sd, float3 throughput, ccl_addr_space PathState *state,
 	PathRadiance *L)
 {
 #ifdef __EMISSION__
-	if(!(kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)))
+	if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)))
 		return;
 
+#ifdef __SHADOW_TRICKS__
+	if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+		kernel_branched_path_surface_connect_light(kg,
+		                                           sd,
+		                                           emission_sd,
+		                                           state,
+		                                           throughput,
+		                                           1.0f,
+		                                           L,
+		                                           1);
+		return;
+	}
+#endif
+
 	/* sample illumination from lights to find path contribution */
-	float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 	float light_u, light_v;
-	path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+	path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
 
 	Ray light_ray;
 	BsdfEval L_light;
 	bool is_lamp;
 
 #ifdef __OBJECT_MOTION__
-	light_ray.time = ccl_fetch(sd, time);
+	light_ray.time = sd->time;
 #endif
 
 	LightSample ls;
-	if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
-		float terminate = path_state_rng_light_termination(kg, rng, state);
+	if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
+		float terminate = path_state_rng_light_termination(kg, state);
 		if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 			/* trace shadow ray */
 			float3 shadow;
 
-			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+			if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 				/* accumulate */
-				path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+				path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
+			}
+			else {
+				path_radiance_accum_total_light(L, state, throughput, &L_light);
 			}
 		}
 	}
 #endif
 }
-#endif
 
 /* path tracing: bounce off or through surface to with new direction stored in ray */
 ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
-                                           ccl_addr_space RNG *rng,
                                            ShaderData *sd,
                                            ccl_addr_space float3 *throughput,
                                            ccl_addr_space PathState *state,
-                                           PathRadiance *L,
+                                           PathRadianceState *L_state,
                                            ccl_addr_space Ray *ray)
 {
 	/* no BSDF? we can stop here */
-	if(ccl_fetch(sd, flag) & SD_BSDF) {
+	if(sd->flag & SD_BSDF) {
 		/* sample BSDF */
 		float bsdf_pdf;
 		BsdfEval bsdf_eval;
 		float3 bsdf_omega_in;
 		differential3 bsdf_domega_in;
 		float bsdf_u, bsdf_v;
-		path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+		path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 		int label;
 
 		label = shader_bsdf_sample(kg, sd, bsdf_u, bsdf_v, &bsdf_eval,
@@ -255,7 +292,7 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
 			return false;
 
 		/* modify throughput */
-		path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
+		path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
 
 		/* set labels */
 		if(!(label & LABEL_TRANSPARENT)) {
@@ -270,16 +307,16 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
 		path_state_next(kg, state, label);
 
 		/* setup ray */
-		ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
+		ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
 		ray->D = normalize(bsdf_omega_in);
 
 		if(state->bounce == 0)
-			ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
+			ray->t -= sd->ray_length; /* clipping works through transparent */
 		else
 			ray->t = FLT_MAX;
 
 #ifdef __RAY_DIFFERENTIALS__
-		ray->dP = ccl_fetch(sd, dP);
+		ray->dP = sd->dP;
 		ray->dD = bsdf_domega_in;
 #endif
 
@@ -291,21 +328,21 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
 		return true;
 	}
 #ifdef __VOLUME__
-	else if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) {
+	else if(sd->flag & SD_HAS_ONLY_VOLUME) {
 		/* no surface shader but have a volume shader? act transparent */
 
 		/* update path state, count as transparent */
 		path_state_next(kg, state, LABEL_TRANSPARENT);
 
 		if(state->bounce == 0)
-			ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
+			ray->t -= sd->ray_length; /* clipping works through transparent */
 		else
 			ray->t = FLT_MAX;
 
 		/* setup ray position, direction stays unchanged */
-		ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng));
+		ray->P = ray_offset(sd->P, -sd->Ng);
 #ifdef __RAY_DIFFERENTIALS__
-		ray->dP = ccl_fetch(sd, dP);
+		ray->dP = sd->dP;
 #endif
 
 		/* enter/exit volume */
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index 3d3b7385d8b..b6a856baf24 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -20,11 +20,10 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline void kernel_path_volume_connect_light(
         KernelGlobals *kg,
-        RNG *rng,
         ShaderData *sd,
         ShaderData *emission_sd,
         float3 throughput,
-        PathState *state,
+        ccl_addr_space PathState *state,
         PathRadiance *L)
 {
 #ifdef __EMISSION__
@@ -32,9 +31,8 @@ ccl_device_inline void kernel_path_volume_connect_light(
 		return;
 
 	/* sample illumination from lights to find path contribution */
-	float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 	float light_u, light_v;
-	path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+	path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
 
 	Ray light_ray;
 	BsdfEval L_light;
@@ -42,24 +40,22 @@ ccl_device_inline void kernel_path_volume_connect_light(
 	bool is_lamp;
 
 	/* connect to light from given point where shader has been evaluated */
-#  ifdef __OBJECT_MOTION__
 	light_ray.time = sd->time;
-#  endif
 
-	if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls))
+	if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls))
 	{
-		float terminate = path_state_rng_light_termination(kg, rng, state);
+		float terminate = path_state_rng_light_termination(kg, state);
 		if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 			/* trace shadow ray */
 			float3 shadow;
 
-			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+			if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 				/* accumulate */
-				path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+				path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
 			}
 		}
 	}
-#endif
+#endif /* __EMISSION__ */
 }
 
 #ifdef __KERNEL_GPU__
@@ -67,8 +63,13 @@ ccl_device_noinline
 #else
 ccl_device
 #endif
-bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
+bool kernel_path_volume_bounce(
+    KernelGlobals *kg,
+    ShaderData *sd,
+    ccl_addr_space float3 *throughput,
+    ccl_addr_space PathState *state,
+    PathRadianceState *L_state,
+    ccl_addr_space Ray *ray)
 {
 	/* sample phase function */
 	float phase_pdf;
@@ -76,7 +77,7 @@ bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
 	float3 phase_omega_in;
 	differential3 phase_domega_in;
 	float phase_u, phase_v;
-	path_state_rng_2D(kg, rng, state, PRNG_PHASE_U, &phase_u, &phase_v);
+	path_state_rng_2D(kg, state, PRNG_BSDF_U, &phase_u, &phase_v);
 	int label;
 
 	label = shader_volume_phase_sample(kg, sd, phase_u, phase_v, &phase_eval,
@@ -86,7 +87,7 @@ bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
 		return false;
 	
 	/* modify throughput */
-	path_radiance_bsdf_bounce(L, throughput, &phase_eval, phase_pdf, state->bounce, label);
+	path_radiance_bsdf_bounce(kg, L_state, throughput, &phase_eval, phase_pdf, state->bounce, label);
 
 	/* set labels */
 	state->ray_pdf = phase_pdf;
@@ -111,9 +112,17 @@ bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
 	return true;
 }
 
-ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, ShaderData *emission_sd, float3 throughput, PathState *state, PathRadiance *L,
-	bool sample_all_lights, Ray *ray, const VolumeSegment *segment)
+#ifndef __SPLIT_KERNEL__
+ccl_device void kernel_branched_path_volume_connect_light(
+        KernelGlobals *kg,
+        ShaderData *sd,
+        ShaderData *emission_sd,
+        float3 throughput,
+        ccl_addr_space PathState *state,
+        PathRadiance *L,
+        bool sample_all_lights,
+        Ray *ray,
+        const VolumeSegment *segment)
 {
 #ifdef __EMISSION__
 	if(!kernel_data.integrator.use_direct_light)
@@ -123,9 +132,7 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 	BsdfEval L_light;
 	bool is_lamp;
 
-#  ifdef __OBJECT_MOTION__
 	light_ray.time = sd->time;
-#  endif
 
 	if(sample_all_lights) {
 		/* lamp sampling */
@@ -135,12 +142,12 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 
 			int num_samples = light_select_num_samples(kg, i);
 			float num_samples_inv = 1.0f/(num_samples*kernel_data.integrator.num_all_lights);
-			RNG lamp_rng = cmj_hash(*rng, i);
+			uint lamp_rng_hash = cmj_hash(state->rng_hash, i);
 
 			for(int j = 0; j < num_samples; j++) {
 				/* sample random position on given light */
 				float light_u, light_v;
-				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+				path_branched_rng_2D(kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
 
 				LightSample ls;
 				lamp_light_sample(kg, i, light_u, light_v, ray->P, &ls);
@@ -148,28 +155,26 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 				float3 tp = throughput;
 
 				/* sample position on volume segment */
-				float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
-				float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
+				float rphase = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL);
+				float rscatter = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE);
 
 				VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
 					state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
 
-				(void)result;
-				kernel_assert(result == VOLUME_PATH_SCATTERED);
-
 				/* todo: split up light_sample so we don't have to call it again with new position */
-				if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) {
+				if(result == VOLUME_PATH_SCATTERED &&
+				   lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) {
 					if(kernel_data.integrator.pdf_triangles != 0.0f)
 						ls.pdf *= 2.0f;
 
-					float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples);
+					float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples);
 					if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 						/* trace shadow ray */
 						float3 shadow;
 
-						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+						if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 							/* accumulate */
-							path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+							path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
 						}
 					}
 				}
@@ -183,42 +188,39 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 
 			for(int j = 0; j < num_samples; j++) {
 				/* sample random position on random triangle */
-				float light_t = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_LIGHT);
 				float light_u, light_v;
-				path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+				path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
 
 				/* only sample triangle lights */
 				if(kernel_data.integrator.num_all_lights)
-					light_t = 0.5f*light_t;
+					light_u = 0.5f*light_u;
 
 				LightSample ls;
-				light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
+				light_sample(kg, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
 
 				float3 tp = throughput;
 
 				/* sample position on volume segment */
-				float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
-				float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
+				float rphase = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL);
+				float rscatter = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE);
 
 				VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
 					state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
 					
-				(void)result;
-				kernel_assert(result == VOLUME_PATH_SCATTERED);
-
 				/* todo: split up light_sample so we don't have to call it again with new position */
-				if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
+				if(result == VOLUME_PATH_SCATTERED &&
+				   light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
 					if(kernel_data.integrator.num_all_lights)
 						ls.pdf *= 2.0f;
 
-					float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples);
+					float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples);
 					if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 						/* trace shadow ray */
 						float3 shadow;
 
-						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+						if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 							/* accumulate */
-							path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+							path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
 						}
 					}
 				}
@@ -227,44 +229,42 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 	}
 	else {
 		/* sample random position on random light */
-		float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 		float light_u, light_v;
-		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+		path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
 
 		LightSample ls;
-		light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
+		light_sample(kg, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
 
 		float3 tp = throughput;
 
 		/* sample position on volume segment */
-		float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
-		float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+		float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
+		float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
 
 		VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
 			state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
 			
-		(void)result;
-		kernel_assert(result == VOLUME_PATH_SCATTERED);
-
 		/* todo: split up light_sample so we don't have to call it again with new position */
-		if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
+		if(result == VOLUME_PATH_SCATTERED &&
+		   light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
 			/* sample random light */
-			float terminate = path_state_rng_light_termination(kg, rng, state);
+			float terminate = path_state_rng_light_termination(kg, state);
 			if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 				/* trace shadow ray */
 				float3 shadow;
 
-				if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+				if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 					/* accumulate */
-					path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+					path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp);
 				}
 			}
 		}
 	}
-#endif
+#endif /* __EMISSION__ */
 }
+#endif /* __SPLIT_KERNEL__ */
 
-#endif
+#endif /* __VOLUME_SCATTER__ */
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h
index 9a2b0884a7e..cbb2442d1dc 100644
--- a/intern/cycles/kernel/kernel_projection.h
+++ b/intern/cycles/kernel/kernel_projection.h
@@ -57,6 +57,9 @@ ccl_device float3 spherical_to_direction(float theta, float phi)
 
 ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range)
 {
+	if(is_zero(dir))
+		return make_float2(0.0f, 0.0f);
+
 	float u = (atan2f(dir.y, dir.x) - range.y) / range.x;
 	float v = (acosf(dir.z / len(dir)) - range.w) / range.z;
 
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
index cf5614b8a86..e32d4bbbc1b 100644
--- a/intern/cycles/kernel/kernel_queues.h
+++ b/intern/cycles/kernel/kernel_queues.h
@@ -17,12 +17,15 @@
 #ifndef __KERNEL_QUEUE_H__
 #define __KERNEL_QUEUE_H__
 
+CCL_NAMESPACE_BEGIN
+
 /*
  * Queue utility functions for split kernel
  */
-
+#ifdef __KERNEL_OPENCL__
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#endif
 
 /*
  * Enqueue ray index into the queue
@@ -35,7 +38,8 @@ ccl_device void enqueue_ray_index(
         ccl_global int *queue_index) /* Array of size num_queues; Used for atomic increment. */
 {
 	/* This thread's queue index. */
-	int my_queue_index = atomic_inc(&queue_index[queue_number]) + (queue_number * queue_size);
+	int my_queue_index = atomic_fetch_and_inc_uint32((ccl_global uint*)&queue_index[queue_number])
+	                   + (queue_number * queue_size);
 	queues[my_queue_index] = ray_index;
 }
 
@@ -47,6 +51,7 @@ ccl_device void enqueue_ray_index(
  * is no more ray to allocate to other threads.
  */
 ccl_device int get_ray_index(
+        KernelGlobals *kg,
         int thread_index,       /* Global thread index. */
         int queue_number,       /* Queue to operate on. */
         ccl_global int *queues, /* Buffer of all queues. */
@@ -68,24 +73,25 @@ ccl_device void enqueue_ray_index_local(
         int queue_number,                            /* Queue in which to enqueue ray index. */
         char enqueue_flag,                           /* True for threads whose ray index has to be enqueued. */
         int queuesize,                               /* queue size. */
-        ccl_local unsigned int *local_queue_atomics,   /* To to local queue atomics. */
+        ccl_local_param unsigned int *local_queue_atomics,   /* To do local queue atomics. */
         ccl_global int *Queue_data,                  /* Queues. */
         ccl_global int *Queue_index)                 /* To do global queue atomics. */
 {
-	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+	int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
 
 	/* Get local queue id .*/
 	unsigned int lqidx;
 	if(enqueue_flag) {
-		lqidx = atomic_inc(local_queue_atomics);
+		lqidx = atomic_fetch_and_inc_uint32(local_queue_atomics);
 	}
-	barrier(CLK_LOCAL_MEM_FENCE);
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
 	/* Get global queue offset. */
 	if(lidx == 0) {
-		*local_queue_atomics = atomic_add(&Queue_index[queue_number], *local_queue_atomics);
+		*local_queue_atomics = atomic_fetch_and_add_uint32((ccl_global uint*)&Queue_index[queue_number],
+		                                                   *local_queue_atomics);
 	}
-	barrier(CLK_LOCAL_MEM_FENCE);
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
 	/* Get global queue index and enqueue ray. */
 	if(enqueue_flag) {
@@ -96,19 +102,19 @@ ccl_device void enqueue_ray_index_local(
 
 ccl_device unsigned int get_local_queue_index(
         int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */
-        ccl_local unsigned int *local_queue_atomics)
+        ccl_local_param unsigned int *local_queue_atomics)
 {
-	int my_lqidx = atomic_inc(&local_queue_atomics[queue_number]);
+	int my_lqidx = atomic_fetch_and_inc_uint32(&local_queue_atomics[queue_number]);
 	return my_lqidx;
 }
 
 ccl_device unsigned int get_global_per_queue_offset(
         int queue_number,
-        ccl_local unsigned int *local_queue_atomics,
+        ccl_local_param unsigned int *local_queue_atomics,
         ccl_global int* global_queue_atomics)
 {
-	unsigned int queue_offset = atomic_add(&global_queue_atomics[queue_number],
-	                                       local_queue_atomics[queue_number]);
+	unsigned int queue_offset = atomic_fetch_and_add_uint32((ccl_global uint*)&global_queue_atomics[queue_number],
+	                                                        local_queue_atomics[queue_number]);
 	return queue_offset;
 }
 
@@ -116,10 +122,27 @@ ccl_device unsigned int get_global_queue_index(
     int queue_number,
     int queuesize,
     unsigned int lqidx,
-    ccl_local unsigned int * global_per_queue_offset)
+    ccl_local_param unsigned int * global_per_queue_offset)
 {
 	int my_gqidx = queuesize * queue_number + lqidx + global_per_queue_offset[queue_number];
 	return my_gqidx;
 }
 
+ccl_device int dequeue_ray_index(
+        int queue_number,
+        ccl_global int *queues,
+        int queue_size,
+        ccl_global int *queue_index)
+{
+	int index = atomic_fetch_and_dec_uint32((ccl_global uint*)&queue_index[queue_number])-1;
+
+	if(index < 0) {
+		return QUEUE_EMPTY_SLOT;
+	}
+
+	return queues[index + queue_number * queue_size];
+}
+
+CCL_NAMESPACE_END
+
 #endif // __KERNEL_QUEUE_H__
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index 2b767da5041..e7a6134b8eb 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -14,222 +14,130 @@
  * limitations under the License.
  */
 
-#include "kernel_jitter.h"
+#include "kernel/kernel_jitter.h"
+#include "util/util_hash.h"
 
 CCL_NAMESPACE_BEGIN
 
-#ifdef __SOBOL__
-
-/* skip initial numbers that are not as well distributed, especially the
- * first sequence is just 0 everywhere, which can be problematic for e.g.
- * path termination */
-#define SOBOL_SKIP 64
-
-/* High Dimensional Sobol */
+/* Pseudo random numbers, uncomment this for debugging correlations. Only run
+ * this single threaded on a CPU for repeatable results. */
+//#define __DEBUG_CORRELATION__
 
-/* van der corput radical inverse */
-ccl_device uint van_der_corput(uint bits)
-{
-	bits = (bits << 16) | (bits >> 16);
-	bits = ((bits & 0x00ff00ff) << 8) | ((bits & 0xff00ff00) >> 8);
-	bits = ((bits & 0x0f0f0f0f) << 4) | ((bits & 0xf0f0f0f0) >> 4);
-	bits = ((bits & 0x33333333) << 2) | ((bits & 0xcccccccc) >> 2);
-	bits = ((bits & 0x55555555) << 1) | ((bits & 0xaaaaaaaa) >> 1);
-	return bits;
-}
 
-/* sobol radical inverse */
-ccl_device uint sobol(uint i)
-{
-	uint r = 0;
-
-	for(uint v = 1U << 31; i; i >>= 1, v ^= v >> 1)
-		if(i & 1)
-			r ^= v;
-
-	return r;
-}
-
-/* inverse of sobol radical inverse */
-ccl_device uint sobol_inverse(uint i)
-{
-	const uint msb = 1U << 31;
-	uint r = 0;
-
-	for(uint v = 1; i; i <<= 1, v ^= v << 1)
-		if(i & msb)
-			r ^= v;
+/* High Dimensional Sobol.
+ *
+ * Multidimensional sobol with generator matrices. Dimension 0 and 1 are equal
+ * to classic Van der Corput and Sobol sequences. */
 
-	return r;
-}
+#ifdef __SOBOL__
 
-/* multidimensional sobol with generator matrices
- * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively */
 ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
 {
 	uint result = 0;
 	uint i = index;
-
-	for(uint j = 0; i; i >>= 1, j++)
-		if(i & 1)
+	for(uint j = 0; i; i >>= 1, j++) {
+		if(i & 1) {
 			result ^= kernel_tex_fetch(__sobol_directions, 32*dimension + j);
-	
+		}
+	}
 	return result;
 }
 
-/* lookup index and x/y coordinate, assumes m is a power of two */
-ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, const uint ey, uint *x, uint *y)
-{
-	/* shift is constant per frame */
-	const uint shift = frame << (m << 1);
-	const uint sobol_shift = sobol(shift);
-	/* van der Corput is its own inverse */
-	const uint lower = van_der_corput(ex << (32 - m));
-	/* need to compensate for ey difference and shift */
-	const uint sobol_lower = sobol(lower);
-	const uint mask = ~-(1 << m) << (32 - m); /* only m upper bits */
-	const uint delta = ((ey << (32 - m)) ^ sobol_lower ^ sobol_shift) & mask;
-	/* only use m upper bits for the index (m is a power of two) */
-	const uint sobol_result = delta | (delta >> m);
-	const uint upper = sobol_inverse(sobol_result);
-	const uint index = shift | upper | lower;
-	*x = van_der_corput(index);
-	*y = sobol_shift ^ sobol_result ^ sobol_lower;
-	return index;
-}
+#endif /* __SOBOL__ */
 
-ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
+
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg,
+                                         uint rng_hash,
+                                         int sample, int num_samples,
+                                         int dimension)
 {
+#ifdef __DEBUG_CORRELATION__
+	return (float)drand48();
+#endif
+
 #ifdef __CMJ__
-	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
-		/* correlated multi-jittered */
-		int p = *rng + dimension;
+#  ifdef __SOBOL__
+	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
+#  endif
+	{
+		/* Correlated multi-jitter. */
+		int p = rng_hash + dimension;
 		return cmj_sample_1D(sample, num_samples, p);
 	}
 #endif
 
-#ifdef __SOBOL_FULL_SCREEN__
-	uint result = sobol_dimension(kg, *rng, dimension);
-	float r = (float)result * (1.0f/(float)0xFFFFFFFF);
-	return r;
-#else
-	/* compute sobol sequence value using direction vectors */
-	uint result = sobol_dimension(kg, sample + SOBOL_SKIP, dimension);
+#ifdef __SOBOL__
+	/* Sobol sequence value using direction vectors. */
+	uint result = sobol_dimension(kg, sample, dimension);
 	float r = (float)result * (1.0f/(float)0xFFFFFFFF);
 
 	/* Cranly-Patterson rotation using rng seed */
 	float shift;
 
-	/* using the same *rng value to offset seems to give correlation issues,
-	 * we could hash it with the dimension but this has a performance impact,
-	 * we need to find a solution for this */
-	if(dimension & 1)
-		shift = (*rng >> 16) * (1.0f/(float)0xFFFF);
-	else
-		shift = (*rng & 0xFFFF) * (1.0f/(float)0xFFFF);
+	/* Hash rng with dimension to solve correlation issues.
+	 * See T38710, T50116.
+	 */
+	uint tmp_rng = cmj_hash_simple(dimension, rng_hash);
+	shift = tmp_rng * (1.0f/(float)0xFFFFFFFF);
 
 	return r + shift - floorf(r + shift);
 #endif
 }
 
-ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_forceinline void path_rng_2D(KernelGlobals *kg,
+                                        uint rng_hash,
+                                        int sample, int num_samples,
+                                        int dimension,
+                                        float *fx, float *fy)
 {
+#ifdef __DEBUG_CORRELATION__
+	*fx = (float)drand48();
+	*fy = (float)drand48();
+	return;
+#endif
+
 #ifdef __CMJ__
-	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
-		/* correlated multi-jittered */
-		int p = *rng + dimension;
+#  ifdef __SOBOL__
+	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
+#  endif
+	{
+		/* Correlated multi-jitter. */
+		int p = rng_hash + dimension;
 		cmj_sample_2D(sample, num_samples, p, fx, fy);
+		return;
 	}
-	else
 #endif
-	{
-		/* sobol */
-		*fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
-		*fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
-	}
-}
-
-ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, ccl_addr_space RNG *rng, int x, int y, float *fx, float *fy)
-{
-#ifdef __SOBOL_FULL_SCREEN__
-	uint px, py;
-	uint bits = 16; /* limits us to 65536x65536 and 65536 samples */
-	uint size = 1 << bits;
-	uint frame = sample;
-
-	*rng = sobol_lookup(bits, frame, x, y, &px, &py);
-
-	*rng ^= kernel_data.integrator.seed;
-
-	if(sample == 0) {
-		*fx = 0.5f;
-		*fy = 0.5f;
-	}
-	else {
-		*fx = size * (float)px * (1.0f/(float)0xFFFFFFFF) - x;
-		*fy = size * (float)py * (1.0f/(float)0xFFFFFFFF) - y;
-	}
-#else
-	*rng = *rng_state;
 
-	*rng ^= kernel_data.integrator.seed;
-
-	if(sample == 0) {
-		*fx = 0.5f;
-		*fy = 0.5f;
-	}
-	else {
-		path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy);
-	}
+#ifdef __SOBOL__
+	/* Sobol. */
+	*fx = path_rng_1D(kg, rng_hash, sample, num_samples, dimension);
+	*fy = path_rng_1D(kg, rng_hash, sample, num_samples, dimension + 1);
 #endif
 }
 
-ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng)
-{
-	/* nothing to do */
-}
-
-#else
-
-/* Linear Congruential Generator */
-
-ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
-{
-	/* implicit mod 2^32 */
-	rng = (1103515245*(rng) + 12345);
-	return (float)rng * (1.0f/(float)0xFFFFFFFF);
-}
-
-ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension, float *fx, float *fy)
-{
-	*fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
-	*fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
-}
-
-ccl_device void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
+ccl_device_inline void path_rng_init(KernelGlobals *kg,
+                                     int sample, int num_samples,
+                                     uint *rng_hash,
+                                     int x, int y,
+                                     float *fx, float *fy)
 {
 	/* load state */
-	*rng = *rng_state;
+	*rng_hash = hash_int_2d(x, y);
+	*rng_hash ^= kernel_data.integrator.seed;
 
-	*rng ^= kernel_data.integrator.seed;
+#ifdef __DEBUG_CORRELATION__
+	srand48(*rng_hash + sample);
+#endif
 
 	if(sample == 0) {
 		*fx = 0.5f;
 		*fy = 0.5f;
 	}
 	else {
-		path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy);
+		path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_FILTER_U, fx, fy);
 	}
 }
 
-ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG rng)
-{
-	/* store state for next sample */
-	*rng_state = rng;
-}
-
-#endif
-
 /* Linear Congruential Generator */
 
 ccl_device uint lcg_step_uint(uint *rng)
@@ -259,90 +167,110 @@ ccl_device uint lcg_init(uint seed)
  * dimension to avoid using the same sequence twice.
  *
  * For branches in the path we must be careful not to reuse the same number
- * in a sequence and offset accordingly. */
-
-ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
-{
-	return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension);
-}
-
-ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
-{
-	/* the rng_offset is not increased for transparent bounces. if we do then
-	 * fully transparent objects can become subtly visible by the different
-	 * sampling patterns used where the transparent object is.
-	 *
-	 * however for some random numbers that will determine if we next bounce
-	 * is transparent we do need to increase the offset to avoid always making
-	 * the same decision */
-	int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
-	return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension);
-}
+ * in a sequence and offset accordingly.
+ */
 
-ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy)
+ccl_device_inline float path_state_rng_1D(KernelGlobals *kg,
+                                          const ccl_addr_space PathState *state,
+                                          int dimension)
 {
-	path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy);
+	return path_rng_1D(kg,
+	                   state->rng_hash,
+	                   state->sample, state->num_samples,
+	                   state->rng_offset + dimension);
 }
 
-ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline void path_state_rng_2D(KernelGlobals *kg,
+                                         const ccl_addr_space PathState *state,
+                                         int dimension,
+                                         float *fx, float *fy)
 {
-	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension);
+	path_rng_2D(kg,
+	            state->rng_hash,
+	            state->sample, state->num_samples,
+	            state->rng_offset + dimension,
+	            fx, fy);
 }
 
-ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D(
+        KernelGlobals *kg,
+        uint rng_hash,
+        const ccl_addr_space PathState *state,
+        int branch,
+        int num_branches,
+        int dimension)
 {
-	int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
-	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension);
+	return path_rng_1D(kg,
+	                   rng_hash,
+	                   state->sample * num_branches + branch,
+	                   state->num_samples * num_branches,
+	                   state->rng_offset + dimension);
 }
 
-ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
+ccl_device_inline void path_branched_rng_2D(
+        KernelGlobals *kg,
+        uint rng_hash,
+        const ccl_addr_space PathState *state,
+        int branch,
+        int num_branches,
+        int dimension,
+        float *fx, float *fy)
 {
-	path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy);
+	path_rng_2D(kg,
+	            rng_hash,
+	            state->sample * num_branches + branch,
+	            state->num_samples * num_branches,
+	            state->rng_offset + dimension,
+	            fx, fy);
 }
 
-/* Utitility functions to get light termination value, since it might not be needed in many cases. */
-ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state)
+/* Utility functions to get light termination value,
+ * since it might not be needed in many cases.
+ */
+ccl_device_inline float path_state_rng_light_termination(
+        KernelGlobals *kg,
+        const ccl_addr_space PathState *state)
 {
 	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
-		return path_state_rng_1D_for_decision(kg, rng, state, PRNG_LIGHT_TERMINATE);
+		return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE);
 	}
 	return 0.0f;
 }
 
-ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches)
+ccl_device_inline float path_branched_rng_light_termination(
+        KernelGlobals *kg,
+        uint rng_hash,
+        const ccl_addr_space PathState *state,
+        int branch,
+        int num_branches)
 {
 	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
-		return path_branched_rng_1D_for_decision(kg, rng, state, branch, num_branches, PRNG_LIGHT_TERMINATE);
+		return path_branched_rng_1D(kg,
+		                            rng_hash,
+		                            state,
+		                            branch,
+		                            num_branches,
+		                            PRNG_LIGHT_TERMINATE);
 	}
 	return 0.0f;
 }
 
-ccl_device_inline void path_state_branch(PathState *state, int branch, int num_branches)
-{
-	/* path is splitting into a branch, adjust so that each branch
-	 * still gets a unique sample from the same sequence */
-	state->rng_offset += PRNG_BOUNCE_NUM;
-	state->sample = state->sample*num_branches + branch;
-	state->num_samples = state->num_samples*num_branches;
-}
-
-ccl_device_inline uint lcg_state_init(RNG *rng, const PathState *state, uint scramble)
+ccl_device_inline uint lcg_state_init(PathState *state,
+                                      uint scramble)
 {
-	return lcg_init(*rng + state->rng_offset + state->sample*scramble);
+	return lcg_init(state->rng_hash + state->rng_offset + state->sample*scramble);
 }
 
-/* TODO(sergey): For until we can use generic address space from OpenCL 2.0. */
-
-ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space RNG *rng,
-                                                const ccl_addr_space PathState *state,
+ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space PathState *state,
                                                 uint scramble)
 {
-	return lcg_init(*rng + state->rng_offset + state->sample*scramble);
+	return lcg_init(state->rng_hash + state->rng_offset + state->sample*scramble);
 }
 
+
 ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng)
 {
-	/* implicit mod 2^32 */
+	/* Implicit mod 2^32 */
 	*rng = (1103515245*(*rng) + 12345);
 	return (float)*rng * (1.0f/(float)0xFFFFFFFF);
 }
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 9d5ea53d5d8..d46da189661 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -24,12 +24,12 @@
  *
  */
 
-#include "closure/alloc.h"
-#include "closure/bsdf_util.h"
-#include "closure/bsdf.h"
-#include "closure/emissive.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_util.h"
+#include "kernel/closure/bsdf.h"
+#include "kernel/closure/emissive.h"
 
-#include "svm/svm.h"
+#include "kernel/svm/svm.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -38,13 +38,13 @@ CCL_NAMESPACE_BEGIN
 #ifdef __OBJECT_MOTION__
 ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
 {
-	if(ccl_fetch(sd, flag) & SD_OBJECT_MOTION) {
-		ccl_fetch(sd, ob_tfm) = object_fetch_transform_motion(kg, ccl_fetch(sd, object), time);
-		ccl_fetch(sd, ob_itfm) = transform_quick_inverse(ccl_fetch(sd, ob_tfm));
+	if(sd->object_flag & SD_OBJECT_MOTION) {
+		sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time);
+		sd->ob_itfm = transform_quick_inverse(sd->ob_tfm);
 	}
 	else {
-		ccl_fetch(sd, ob_tfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
-		ccl_fetch(sd, ob_itfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+		sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+		sd->ob_itfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
 	}
 }
 #endif
@@ -55,103 +55,104 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
                                                const Ray *ray)
 {
 #ifdef __INSTANCING__
-	ccl_fetch(sd, object) = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
+	sd->object = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
 #endif
 
-	ccl_fetch(sd, type) = isect->type;
-	ccl_fetch(sd, flag) = kernel_tex_fetch(__object_flag, ccl_fetch(sd, object));
+	sd->type = isect->type;
+	sd->flag = 0;
+	sd->object_flag = kernel_tex_fetch(__object_flag,
+	                                              sd->object);
 
 	/* matrices and time */
 #ifdef __OBJECT_MOTION__
 	shader_setup_object_transforms(kg, sd, ray->time);
-	ccl_fetch(sd, time) = ray->time;
 #endif
+	sd->time = ray->time;
 
-	if(ccl_fetch(sd, type) & PRIMITIVE_VOLUME) {
-		ccl_fetch(sd, prim) = isect->prim;
+	if (sd->type & PRIMITIVE_VOLUME) {
+		sd->prim = isect->prim;
 	}
 	else {
-		ccl_fetch(sd, prim) = kernel_tex_fetch(__prim_index, isect->prim);
+		sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
 	}
-
-	ccl_fetch(sd, ray_length) = isect->t;
+	sd->ray_length = isect->t;
 
 #ifdef __UV__
-	ccl_fetch(sd, u) = isect->u;
-	ccl_fetch(sd, v) = isect->v;
+	sd->u = isect->u;
+	sd->v = isect->v;
 #endif
 
 #ifdef __HAIR__
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
 		/* curve */
-		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
 
-		ccl_fetch(sd, shader) = __float_as_int(curvedata.z);
-		ccl_fetch(sd, P) = bvh_curve_refine(kg, sd, isect, ray);
+		sd->shader = __float_as_int(curvedata.z);
+		sd->P = curve_refine(kg, sd, isect, ray);
 	}
 	else
 #endif
-	if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
+	if(sd->type & PRIMITIVE_TRIANGLE) {
 		/* static triangle */
 		float3 Ng = triangle_normal(kg, sd);
-		ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
+		sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
 
 		/* vectors */
-		ccl_fetch(sd, P) = triangle_refine(kg, sd, isect, ray);
-		ccl_fetch(sd, Ng) = Ng;
-		ccl_fetch(sd, N) = Ng;
+		sd->P = triangle_refine(kg, sd, isect, ray);
+		sd->Ng = Ng;
+		sd->N = Ng;
 		
 		/* smooth normal */
-		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL)
-			ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));
+		if(sd->shader & SHADER_SMOOTH_NORMAL)
+			sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
 
 #ifdef __DPDU__
 		/* dPdu/dPdv */
-		triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));
+		triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
 #endif
 	}
-	else if(ccl_fetch(sd, type) & PRIMITIVE_VOLUME) {
-		ccl_fetch(sd, shader) = kernel_tex_fetch(__vol_shader, ccl_fetch(sd, prim));
+	else if(sd->type & PRIMITIVE_VOLUME) {
+		sd->shader = kernel_tex_fetch(__vol_shader, sd->prim);
 	}
 	else {
 		/* motion triangle */
 		motion_triangle_shader_setup(kg, sd, isect, ray, false);
 	}
 
-	ccl_fetch(sd, I) = -ray->D;
+	sd->I = -ray->D;
 
-	ccl_fetch(sd, flag) |= kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE);
+	sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
 
 #ifdef __INSTANCING__
 	if(isect->object != OBJECT_NONE) {
 		/* instance transform */
-		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
-		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng));
+		object_normal_transform_auto(kg, sd, &sd->N);
+		object_normal_transform_auto(kg, sd, &sd->Ng);
 #  ifdef __DPDU__
-		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
-		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
+		object_dir_transform_auto(kg, sd, &sd->dPdu);
+		object_dir_transform_auto(kg, sd, &sd->dPdv);
 #  endif
 	}
 #endif
 
 	/* backfacing test */
-	bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);
+	bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
 
 	if(backfacing) {
-		ccl_fetch(sd, flag) |= SD_BACKFACING;
-		ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
-		ccl_fetch(sd, N) = -ccl_fetch(sd, N);
+		sd->flag |= SD_BACKFACING;
+		sd->Ng = -sd->Ng;
+		sd->N = -sd->N;
 #ifdef __DPDU__
-		ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
-		ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
+		sd->dPdu = -sd->dPdu;
+		sd->dPdv = -sd->dPdv;
 #endif
 	}
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* differentials */
-	differential_transfer(&ccl_fetch(sd, dP), ray->dP, ray->D, ray->dD, ccl_fetch(sd, Ng), isect->t);
-	differential_incoming(&ccl_fetch(sd, dI), ray->dD);
-	differential_dudv(&ccl_fetch(sd, du), &ccl_fetch(sd, dv), ccl_fetch(sd, dPdu), ccl_fetch(sd, dPdv), ccl_fetch(sd, dP), ccl_fetch(sd, Ng));
+	differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t);
+	differential_incoming(&sd->dI, ray->dD);
+	differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
 #endif
 }
 
@@ -169,10 +170,11 @@ void shader_setup_from_subsurface(
         const Intersection *isect,
         const Ray *ray)
 {
-	bool backfacing = sd->flag & SD_BACKFACING;
+	const bool backfacing = sd->flag & SD_BACKFACING;
 
 	/* object, matrices, time, ray_length stay the same */
-	sd->flag = kernel_tex_fetch(__object_flag, sd->object);
+	sd->flag = 0;
+	sd->object_flag = kernel_tex_fetch(__object_flag, sd->object);
 	sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
 	sd->type = isect->type;
 
@@ -192,7 +194,7 @@ void shader_setup_from_subsurface(
 		sd->N = Ng;
 
 		if(sd->shader & SHADER_SMOOTH_NORMAL)
-			sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+			sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
 
 #  ifdef __DPDU__
 		/* dPdu/dPdv */
@@ -209,11 +211,11 @@ void shader_setup_from_subsurface(
 #  ifdef __INSTANCING__
 	if(isect->object != OBJECT_NONE) {
 		/* instance transform */
-		object_normal_transform(kg, sd, &sd->N);
-		object_normal_transform(kg, sd, &sd->Ng);
+		object_normal_transform_auto(kg, sd, &sd->N);
+		object_normal_transform_auto(kg, sd, &sd->Ng);
 #    ifdef __DPDU__
-		object_dir_transform(kg, sd, &sd->dPdu);
-		object_dir_transform(kg, sd, &sd->dPdv);
+		object_dir_transform_auto(kg, sd, &sd->dPdu);
+		object_dir_transform_auto(kg, sd, &sd->dPdv);
 #    endif
 	}
 #  endif
@@ -255,104 +257,106 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
                                                 int lamp)
 {
 	/* vectors */
-	ccl_fetch(sd, P) = P;
-	ccl_fetch(sd, N) = Ng;
-	ccl_fetch(sd, Ng) = Ng;
-	ccl_fetch(sd, I) = I;
-	ccl_fetch(sd, shader) = shader;
+	sd->P = P;
+	sd->N = Ng;
+	sd->Ng = Ng;
+	sd->I = I;
+	sd->shader = shader;
 	if(prim != PRIM_NONE)
-		ccl_fetch(sd, type) = PRIMITIVE_TRIANGLE;
+		sd->type = PRIMITIVE_TRIANGLE;
 	else if(lamp != LAMP_NONE)
-		ccl_fetch(sd, type) = PRIMITIVE_LAMP;
+		sd->type = PRIMITIVE_LAMP;
 	else
-		ccl_fetch(sd, type) = PRIMITIVE_NONE;
+		sd->type = PRIMITIVE_NONE;
 
 	/* primitive */
 #ifdef __INSTANCING__
-	ccl_fetch(sd, object) = object;
+	sd->object = object;
 #endif
 	/* currently no access to bvh prim index for strand sd->prim*/
-	ccl_fetch(sd, prim) = prim;
+	sd->prim = prim;
 #ifdef __UV__
-	ccl_fetch(sd, u) = u;
-	ccl_fetch(sd, v) = v;
+	sd->u = u;
+	sd->v = v;
 #endif
-	ccl_fetch(sd, ray_length) = t;
+	sd->time = time;
+	sd->ray_length = t;
 
-	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE);
-	if(ccl_fetch(sd, object) != OBJECT_NONE) {
-		ccl_fetch(sd, flag) |= kernel_tex_fetch(__object_flag, ccl_fetch(sd, object));
+	sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
+	sd->object_flag = 0;
+	if(sd->object != OBJECT_NONE) {
+		sd->object_flag |= kernel_tex_fetch(__object_flag,
+		                                    sd->object);
 
 #ifdef __OBJECT_MOTION__
 		shader_setup_object_transforms(kg, sd, time);
-		ccl_fetch(sd, time) = time;
 	}
 	else if(lamp != LAMP_NONE) {
-		ccl_fetch(sd, ob_tfm)  = lamp_fetch_transform(kg, lamp, false);
-		ccl_fetch(sd, ob_itfm) = lamp_fetch_transform(kg, lamp, true);
+		sd->ob_tfm  = lamp_fetch_transform(kg, lamp, false);
+		sd->ob_itfm = lamp_fetch_transform(kg, lamp, true);
 #endif
 	}
 
 	/* transform into world space */
 	if(object_space) {
-		object_position_transform_auto(kg, sd, &ccl_fetch(sd, P));
-		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng));
-		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
-		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, I));
+		object_position_transform_auto(kg, sd, &sd->P);
+		object_normal_transform_auto(kg, sd, &sd->Ng);
+		sd->N = sd->Ng;
+		object_dir_transform_auto(kg, sd, &sd->I);
 	}
 
-	if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
+	if(sd->type & PRIMITIVE_TRIANGLE) {
 		/* smooth normal */
-		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
-			ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));
+		if(sd->shader & SHADER_SMOOTH_NORMAL) {
+			sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
 
 #ifdef __INSTANCING__
-			if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED)) {
-				object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
+			if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+				object_normal_transform_auto(kg, sd, &sd->N);
 			}
 #endif
 		}
 
 		/* dPdu/dPdv */
 #ifdef __DPDU__
-		triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));
+		triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
 
 #  ifdef __INSTANCING__
-		if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED)) {
-			object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
-			object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
+		if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+			object_dir_transform_auto(kg, sd, &sd->dPdu);
+			object_dir_transform_auto(kg, sd, &sd->dPdv);
 		}
 #  endif
 #endif
 	}
 	else {
 #ifdef __DPDU__
-		ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
-		ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
+		sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
+		sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 	}
 
 	/* backfacing test */
-	if(ccl_fetch(sd, prim) != PRIM_NONE) {
-		bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);
+	if(sd->prim != PRIM_NONE) {
+		bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
 
 		if(backfacing) {
-			ccl_fetch(sd, flag) |= SD_BACKFACING;
-			ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
-			ccl_fetch(sd, N) = -ccl_fetch(sd, N);
+			sd->flag |= SD_BACKFACING;
+			sd->Ng = -sd->Ng;
+			sd->N = -sd->N;
 #ifdef __DPDU__
-			ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
-			ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
+			sd->dPdu = -sd->dPdu;
+			sd->dPdv = -sd->dPdv;
 #endif
 		}
 	}
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* no ray differentials here yet */
-	ccl_fetch(sd, dP) = differential3_zero();
-	ccl_fetch(sd, dI) = differential3_zero();
-	ccl_fetch(sd, du) = differential_zero();
-	ccl_fetch(sd, dv) = differential_zero();
+	sd->dP = differential3_zero();
+	sd->dI = differential3_zero();
+	sd->du = differential_zero();
+	sd->dv = differential_zero();
 #endif
 }
 
@@ -373,7 +377,7 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
 	                         P, Ng, I,
 	                         shader, object, prim,
 	                         u, v, 0.0f, 0.5f,
-	                         !(kernel_tex_fetch(__object_flag, object) & SD_TRANSFORM_APPLIED),
+	                         !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED),
 	                         LAMP_NONE);
 }
 
@@ -382,38 +386,37 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
 ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray)
 {
 	/* vectors */
-	ccl_fetch(sd, P) = ray->D;
-	ccl_fetch(sd, N) = -ray->D;
-	ccl_fetch(sd, Ng) = -ray->D;
-	ccl_fetch(sd, I) = -ray->D;
-	ccl_fetch(sd, shader) = kernel_data.background.surface_shader;
-	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE);
-#ifdef __OBJECT_MOTION__
-	ccl_fetch(sd, time) = ray->time;
-#endif
-	ccl_fetch(sd, ray_length) = 0.0f;
+	sd->P = ray->D;
+	sd->N = -ray->D;
+	sd->Ng = -ray->D;
+	sd->I = -ray->D;
+	sd->shader = kernel_data.background.surface_shader;
+	sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
+	sd->object_flag = 0;
+	sd->time = ray->time;
+	sd->ray_length = 0.0f;
 
 #ifdef __INSTANCING__
-	ccl_fetch(sd, object) = PRIM_NONE;
+	sd->object = PRIM_NONE;
 #endif
-	ccl_fetch(sd, prim) = PRIM_NONE;
+	sd->prim = PRIM_NONE;
 #ifdef __UV__
-	ccl_fetch(sd, u) = 0.0f;
-	ccl_fetch(sd, v) = 0.0f;
+	sd->u = 0.0f;
+	sd->v = 0.0f;
 #endif
 
 #ifdef __DPDU__
 	/* dPdu/dPdv */
-	ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
-	ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
+	sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
+	sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* differentials */
-	ccl_fetch(sd, dP) = ray->dD;
-	differential_incoming(&ccl_fetch(sd, dI), ccl_fetch(sd, dP));
-	ccl_fetch(sd, du) = differential_zero();
-	ccl_fetch(sd, dv) = differential_zero();
+	sd->dP = ray->dD;
+	differential_incoming(&sd->dI, sd->dP);
+	sd->du = differential_zero();
+	sd->dv = differential_zero();
 #endif
 }
 
@@ -429,9 +432,8 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s
 	sd->I = -ray->D;
 	sd->shader = SHADER_NONE;
 	sd->flag = 0;
-#ifdef __OBJECT_MOTION__
+	sd->object_flag = 0;
 	sd->time = ray->time;
-#endif
 	sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */
 
 #ifdef __INSTANCING__
@@ -500,25 +502,50 @@ ccl_device_inline void shader_merge_closures(ShaderData *sd)
 }
 #endif
 
+/* Defensive sampling. */
+
+ccl_device_inline void shader_prepare_closures(ShaderData *sd,
+                                               ccl_addr_space PathState *state)
+{
+	/* Before any bounce, raise each BSDF/BSSRDF sample weight to at least 1/8
+	 * of their sum. Deeper bounces (e.g. a perfect mirror) could likely use
+	 * this too, but that will need a good heuristic. */
+	if(state->bounce + state->transparent_bounce == 0 && sd->num_closure > 1) {
+		float sum = 0.0f;
+
+		for(int i = 0; i < sd->num_closure; i++) {
+			ShaderClosure *sc = &sd->closure[i];
+			if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+				sum += sc->sample_weight;
+			}
+		}
+
+		for(int i = 0; i < sd->num_closure; i++) {
+			ShaderClosure *sc = &sd->closure[i];
+			if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+				sc->sample_weight = max(sc->sample_weight, 0.125f * sum);
+			}
+		}
+	}
+}
+
+
 /* BSDF */
 
 ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, ShaderData *sd, const float3 omega_in, float *pdf,
-	int skip_bsdf, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight)
+	const ShaderClosure *skip_sc, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight)
 {
 	/* this is the veach one-sample model with balance heuristic, some pdf
 	 * factors drop out when using balance heuristic weighting */
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		if(i == skip_bsdf)
-			continue;
-
-		const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		const ShaderClosure *sc = &sd->closure[i];
 
-		if(CLOSURE_IS_BSDF(sc->type)) {
+		if(sc != skip_sc && CLOSURE_IS_BSDF(sc->type)) {
 			float bsdf_pdf = 0.0f;
 			float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
 
 			if(bsdf_pdf != 0.0f) {
-				bsdf_eval_accum(result_eval, sc->type, eval*sc->weight);
+				bsdf_eval_accum(result_eval, sc->type, eval*sc->weight, 1.0f);
 				sum_pdf += bsdf_pdf*sc->sample_weight;
 			}
 
@@ -537,8 +564,8 @@ ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg,
                                                         float light_pdf,
                                                         bool use_mis)
 {
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		const ShaderClosure *sc = &sd->closure[i];
 		if(CLOSURE_IS_BSDF(sc->type)) {
 			float bsdf_pdf = 0.0f;
 			float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
@@ -546,7 +573,8 @@ ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg,
 				float mis_weight = use_mis? power_heuristic(light_pdf, bsdf_pdf): 1.0f;
 				bsdf_eval_accum(result_eval,
 				                sc->type,
-				                eval * sc->weight * mis_weight);
+				                eval * sc->weight,
+				                mis_weight);
 			}
 		}
 	}
@@ -575,56 +603,128 @@ void shader_bsdf_eval(KernelGlobals *kg,
 #endif
 	{
 		float pdf;
-		_shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, -1, eval, 0.0f, 0.0f);
+		_shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, NULL, eval, 0.0f, 0.0f);
 		if(use_mis) {
 			float weight = power_heuristic(light_pdf, pdf);
-			bsdf_eval_mul(eval, weight);
+			bsdf_eval_mis(eval, weight);
 		}
 	}
 }
 
-ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
-                                         ShaderData *sd,
-                                         float randu, float randv,
-                                         BsdfEval *bsdf_eval,
-                                         float3 *omega_in,
-                                         differential3 *domega_in,
-                                         float *pdf)
+ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd,
+                                                        float *randu)
 {
 	int sampled = 0;
 
-	if(ccl_fetch(sd, num_closure) > 1) {
-		/* pick a BSDF closure based on sample weights */
+	if(sd->num_closure > 1) {
+		/* Pick a BSDF closure with probability proportional to its sample weight. */
 		float sum = 0.0f;
 
-		for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
-			const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
-			
-			if(CLOSURE_IS_BSDF(sc->type))
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
+
+			if(CLOSURE_IS_BSDF(sc->type)) {
 				sum += sc->sample_weight;
+			}
 		}
 
-		float r = ccl_fetch(sd, randb_closure)*sum;
-		sum = 0.0f;
+		float r = (*randu)*sum;
+		float partial_sum = 0.0f;
+
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
 
-		for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
-			const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
-			
 			if(CLOSURE_IS_BSDF(sc->type)) {
-				sum += sc->sample_weight;
+				float next_sum = partial_sum + sc->sample_weight;
+
+				if(r < next_sum) {
+					sampled = i;
 
-				if(r <= sum)
+					/* Rescale to reuse for direction sample, to better
+					 * preserve stratification. */
+					*randu = (r - partial_sum) / sc->sample_weight;
 					break;
+				}
+
+				partial_sum = next_sum;
 			}
 		}
+	}
 
-		if(sampled == ccl_fetch(sd, num_closure)) {
-			*pdf = 0.0f;
-			return LABEL_NONE;
+	return CLOSURE_IS_BSDF(sd->closure[sampled].type)? &sd->closure[sampled]: NULL; /* NULL: no BSDF was picked. */
+}
+
+ccl_device_inline const ShaderClosure *shader_bssrdf_pick(ShaderData *sd,
+                                                          ccl_addr_space float3 *throughput,
+                                                          float *randu)
+{
+	int sampled = 0;
+
+	if(sd->num_closure > 1) {
+		/* Pick between all BSDFs and the individual BSSRDFs based on sample weights. */
+		float sum_bsdf = 0.0f;
+		float sum_bssrdf = 0.0f;
+
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
+
+			if(CLOSURE_IS_BSDF(sc->type)) {
+				sum_bsdf += sc->sample_weight;
+			}
+			else if(CLOSURE_IS_BSSRDF(sc->type)) {
+				sum_bssrdf += sc->sample_weight;
+			}
 		}
+
+		float r = (*randu)*(sum_bsdf + sum_bssrdf);
+		float partial_sum = 0.0f;
+
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
+
+			if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+				float next_sum = partial_sum + sc->sample_weight;
+
+				if(r < next_sum) {
+					if(CLOSURE_IS_BSDF(sc->type)) {
+						*throughput *= (sum_bsdf + sum_bssrdf) / sum_bsdf;
+						return NULL;
+					}
+					else {
+						*throughput *= (sum_bsdf + sum_bssrdf) / sum_bssrdf;
+						sampled = i;
+
+						/* Rescale to reuse for direction sample, to better
+						 * preserve stratification. */
+						*randu = (r - partial_sum) / sc->sample_weight;
+						break;
+					}
+				}
+
+				partial_sum = next_sum;
+			}
+		}
+	}
+
+	return CLOSURE_IS_BSSRDF(sd->closure[sampled].type)? &sd->closure[sampled]: NULL; /* NULL: no BSSRDF was picked. */
+}
+
+ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
+                                         ShaderData *sd,
+                                         float randu, float randv,
+                                         BsdfEval *bsdf_eval,
+                                         float3 *omega_in,
+                                         differential3 *domega_in,
+                                         float *pdf)
+{
+	const ShaderClosure *sc = shader_bsdf_pick(sd, &randu);
+	if(sc == NULL) {
+		*pdf = 0.0f;
+		return LABEL_NONE;
 	}
 
-	const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
+	/* BSSRDF should already have been handled elsewhere. */
+	kernel_assert(CLOSURE_IS_BSDF(sc->type));
 
 	int label;
 	float3 eval;
@@ -635,9 +735,9 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
 	if(*pdf != 0.0f) {
 		bsdf_eval_init(bsdf_eval, sc->type, eval*sc->weight, kernel_data.film.use_light_pass);
 
-		if(ccl_fetch(sd, num_closure) > 1) {
+		if(sd->num_closure > 1) {
 			float sweight = sc->sample_weight;
-			_shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight);
+			_shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sc, bsdf_eval, *pdf*sweight, sweight);
 		}
 	}
 
@@ -662,23 +762,23 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, ShaderData *sd,
 
 ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness)
 {
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF(sc->type))
 			bsdf_blur(kg, sc, roughness);
 	}
 }
 
-ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd)
 {
-	if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME)
+	if(sd->flag & SD_HAS_ONLY_VOLUME)
 		return make_float3(1.0f, 1.0f, 1.0f);
 
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		const ShaderClosure *sc = &sd->closure[i];
 
 		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl
 			eval += sc->weight;
@@ -687,6 +787,18 @@ ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
 	return eval;
 }
 
+ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *sd)
+{
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
+		/* Zero the sample weight and color weight of every transparent BSDF closure. */
+		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) {
+			sc->sample_weight = 0.0f;
+			sc->weight = make_float3(0.0f, 0.0f, 0.0f);
+		}
+	}
+}
+
 ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 alpha = make_float3(1.0f, 1.0f, 1.0f) - shader_bsdf_transparency(kg, sd);
@@ -701,8 +813,8 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
 			eval += sc->weight;
@@ -715,8 +827,8 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
 			eval += sc->weight;
@@ -729,8 +841,8 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
 			eval += sc->weight;
@@ -743,8 +855,8 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type))
 			eval += sc->weight;
@@ -753,13 +865,26 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
 	return eval;
 }
 
+ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd)
+{
+	float3 N = make_float3(0.0f, 0.0f, 0.0f);
+	/* Sum the normals of all BSDF/BSSRDF closures, each scaled by average(sc->weight). */
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
+		if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
+			N += sc->N*average(sc->weight);
+	}
+	/* Fall back to sd->N when the weighted sum is zero, otherwise normalize it. */
+	return (is_zero(N))? sd->N : normalize(N);
+}
+
 ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_factor, float3 *N_)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 	float3 N = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
 			const DiffuseBsdf *bsdf = (const DiffuseBsdf*)sc;
@@ -768,16 +893,11 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
 		}
 		else if(CLOSURE_IS_AMBIENT_OCCLUSION(sc->type)) {
 			eval += sc->weight;
-			N += ccl_fetch(sd, N)*average(sc->weight);
+			N += sd->N*average(sc->weight);
 		}
 	}
 
-	if(is_zero(N))
-		N = ccl_fetch(sd, N);
-	else
-		N = normalize(N);
-
-	*N_ = N;
+	*N_ = (is_zero(N))? sd->N : normalize(N);
 	return eval;
 }
 
@@ -788,8 +908,8 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 	float3 N = make_float3(0.0f, 0.0f, 0.0f);
 	float texture_blur = 0.0f, weight_sum = 0.0f;
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSSRDF(sc->type)) {
 			const Bssrdf *bssrdf = (const Bssrdf*)sc;
@@ -803,10 +923,10 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 	}
 
 	if(N_)
-		*N_ = (is_zero(N))? ccl_fetch(sd, N): normalize(N);
+		*N_ = (is_zero(N))? sd->N: normalize(N);
 
 	if(texture_blur_)
-		*texture_blur_ = texture_blur/weight_sum;
+		*texture_blur_ = safe_divide(texture_blur, weight_sum);
 	
 	return eval;
 }
@@ -816,7 +936,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 
 ccl_device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *sc)
 {
-	return emissive_simple_eval(ccl_fetch(sd, Ng), ccl_fetch(sd, I));
+	return emissive_simple_eval(sd->Ng, sd->I);
 }
 
 ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
@@ -824,8 +944,8 @@ ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
 	float3 eval;
 	eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_EMISSION(sc->type))
 			eval += emissive_eval(kg, sd, sc)*sc->weight;
@@ -840,8 +960,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 weight = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_HOLDOUT(sc->type))
 			weight += sc->weight;
@@ -852,16 +972,15 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 
 /* Surface Evaluation */
 
-ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_space RNG *rng,
-	ccl_addr_space PathState *state, float randb, int path_flag, ShaderContext ctx)
+ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
+	ccl_addr_space PathState *state, int path_flag)
 {
-	ccl_fetch(sd, num_closure) = 0;
-	ccl_fetch(sd, num_closure_extra) = 0;
-	ccl_fetch(sd, randb_closure) = randb;
+	sd->num_closure = 0;
+	sd->num_closure_extra = 0;
 
 #ifdef __OSL__
 	if(kg->osl)
-		OSLShader::eval_surface(kg, sd, state, path_flag, ctx);
+		OSLShader::eval_surface(kg, sd, state, path_flag);
 	else
 #endif
 	{
@@ -871,29 +990,28 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_
 		DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd,
 		                                             sizeof(DiffuseBsdf),
 		                                             make_float3(0.8f, 0.8f, 0.8f));
-		bsdf->N = ccl_fetch(sd, N);
-		ccl_fetch(sd, flag) |= bsdf_diffuse_setup(bsdf);
+		bsdf->N = sd->N;
+		sd->flag |= bsdf_diffuse_setup(bsdf);
 #endif
 	}
 
-	if(rng && (ccl_fetch(sd, flag) & SD_BSDF_NEEDS_LCG)) {
-		ccl_fetch(sd, lcg_state) = lcg_state_init_addrspace(rng, state, 0xb4bc3953);
+	if(sd->flag & SD_BSDF_NEEDS_LCG) {
+		sd->lcg_state = lcg_state_init_addrspace(state, 0xb4bc3953);
 	}
 }
 
 /* Background Evaluation */
 
 ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd,
-	ccl_addr_space PathState *state, int path_flag, ShaderContext ctx)
+	ccl_addr_space PathState *state, int path_flag)
 {
-	ccl_fetch(sd, num_closure) = 0;
-	ccl_fetch(sd, num_closure_extra) = 0;
-	ccl_fetch(sd, randb_closure) = 0.0f;
+	sd->num_closure = 0;
+	sd->num_closure_extra = 0;
 
 #ifdef __SVM__
 #ifdef __OSL__
 	if(kg->osl) {
-		OSLShader::eval_background(kg, sd, state, path_flag, ctx);
+		OSLShader::eval_background(kg, sd, state, path_flag);
 	}
 	else
 #endif
@@ -903,8 +1021,8 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd,
 
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		const ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BACKGROUND(sc->type))
 			eval += sc->weight;
@@ -934,7 +1052,7 @@ ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, con
 			float3 eval = volume_phase_eval(sd, sc, omega_in, &phase_pdf);
 
 			if(phase_pdf != 0.0f) {
-				bsdf_eval_accum(result_eval, sc->type, eval);
+				bsdf_eval_accum(result_eval, sc->type, eval, 1.0f);
 				sum_pdf += phase_pdf*sc->sample_weight;
 			}
 
@@ -970,17 +1088,22 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg, const ShaderData *s
 				sum += sc->sample_weight;
 		}
 
-		float r = sd->randb_closure*sum;
-		sum = 0.0f;
+		float r = randu*sum;
+		float partial_sum = 0.0f;
 
 		for(sampled = 0; sampled < sd->num_closure; sampled++) {
 			const ShaderClosure *sc = &sd->closure[sampled];
 			
 			if(CLOSURE_IS_PHASE(sc->type)) {
-				sum += sc->sample_weight;
+				float next_sum = partial_sum + sc->sample_weight;
 
-				if(r <= sum)
+				if(r <= next_sum) {
+					/* Rescale to reuse for BSDF direction sample. */
+					randu = (r - partial_sum) / sc->sample_weight;
 					break;
+				}
+
+				partial_sum = next_sum;
 			}
 		}
 
@@ -1026,16 +1149,16 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg, const ShaderData *
 
 ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
                                           ShaderData *sd,
-                                          PathState *state,
-                                          VolumeStack *stack,
-                                          int path_flag,
-                                          ShaderContext ctx)
+                                          ccl_addr_space PathState *state,
+                                          ccl_addr_space VolumeStack *stack,
+                                          int path_flag)
 {
 	/* reset closures once at the start, we will be accumulating the closures
 	 * for all volumes in the stack into a single array of closures */
 	sd->num_closure = 0;
 	sd->num_closure_extra = 0;
 	sd->flag = 0;
+	sd->object_flag = 0;
 
 	for(int i = 0; stack[i].shader != SHADER_NONE; i++) {
 		/* setup shaderdata from stack. it's mostly setup already in
@@ -1043,11 +1166,12 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
 		sd->object = stack[i].object;
 		sd->shader = stack[i].shader;
 
-		sd->flag &= ~(SD_SHADER_FLAGS|SD_OBJECT_FLAGS);
+		sd->flag &= ~SD_SHADER_FLAGS;
 		sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
+		sd->object_flag &= ~SD_OBJECT_FLAGS;
 
 		if(sd->object != OBJECT_NONE) {
-			sd->flag |= kernel_tex_fetch(__object_flag, sd->object);
+			sd->object_flag |= kernel_tex_fetch(__object_flag, sd->object);
 
 #ifdef __OBJECT_MOTION__
 			/* todo: this is inefficient for motion blur, we should be
@@ -1060,7 +1184,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
 #ifdef __SVM__
 #  ifdef __OSL__
 		if(kg->osl) {
-			OSLShader::eval_volume(kg, sd, state, path_flag, ctx);
+			OSLShader::eval_volume(kg, sd, state, path_flag);
 		}
 		else
 #  endif
@@ -1079,17 +1203,16 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
 
 /* Displacement Evaluation */
 
-ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderContext ctx)
+ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state)
 {
-	ccl_fetch(sd, num_closure) = 0;
-	ccl_fetch(sd, num_closure_extra) = 0;
-	ccl_fetch(sd, randb_closure) = 0.0f;
+	sd->num_closure = 0;
+	sd->num_closure_extra = 0;
 
 	/* this will modify sd->P */
 #ifdef __SVM__
 #  ifdef __OSL__
 	if(kg->osl)
-		OSLShader::eval_displacement(kg, sd, ctx);
+		OSLShader::eval_displacement(kg, sd, state);
 	else
 #  endif
 	{
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index 2981f6ac566..8a0da6c3b13 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -16,9 +16,118 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifdef __SHADOW_RECORD_ALL__
+#ifdef __VOLUME__
+typedef struct VolumeState {
+#  ifdef __SPLIT_KERNEL__
+#  else
+	PathState ps;
+#  endif
+} VolumeState;
+
+/* Get PathState ready for use for volume stack evaluation. */
+#  ifdef __SPLIT_KERNEL__
+ccl_addr_space
+#  endif
+ccl_device_inline PathState *shadow_blocked_volume_path_state(
+        KernelGlobals *kg,
+        VolumeState *volume_state,
+        ccl_addr_space PathState *state,
+        ShaderData *sd,
+        Ray *ray)
+{
+#  ifdef __SPLIT_KERNEL__
+	ccl_addr_space PathState *ps =
+	        &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
+#  else
+	PathState *ps = &volume_state->ps;
+#  endif
+	*ps = *state;
+	/* We are checking for shadow on the "other" side of the surface, so need
+	 * to discard volume we are currently at.
+	 */
+	if(dot(sd->Ng, ray->D) < 0.0f) {
+		kernel_volume_stack_enter_exit(kg, sd, ps->volume_stack);
+	}
+	return ps;
+}
+#endif  /* __VOLUME__ */
+
+/* Attenuate throughput according to the given intersection event.
+ * Returns true if the throughput is zero and traversal can be aborted.
+ */
+ccl_device_forceinline bool shadow_handle_transparent_isect(
+        KernelGlobals *kg,
+        ShaderData *shadow_sd,
+        ccl_addr_space PathState *state,
+#    ifdef __VOLUME__
+        ccl_addr_space struct PathState *volume_state,
+#    endif
+        Intersection *isect,
+        Ray *ray,
+        float3 *throughput)
+{
+#ifdef __VOLUME__
+	/* Attenuation between last surface and next surface. */
+	if(volume_state->volume_stack[0].shader != SHADER_NONE) {
+		Ray segment_ray = *ray;
+		segment_ray.t = isect->t;
+		kernel_volume_shadow(kg,
+		                     shadow_sd,
+		                     volume_state,
+		                     &segment_ray,
+		                     throughput);
+	}
+#endif
+	/* Setup shader data at surface. */
+	shader_setup_from_ray(kg, shadow_sd, isect, ray);
+	/* Attenuation from transparent surface. */
+	if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
+		path_state_modify_bounce(state, true);
+		shader_eval_surface(kg,
+		                    shadow_sd,
+		                    state,
+		                    PATH_RAY_SHADOW);
+		path_state_modify_bounce(state, false);
+		*throughput *= shader_bsdf_transparency(kg, shadow_sd);
+	}
+	/* Stop if all light is blocked. */
+	if(is_zero(*throughput)) {
+		return true;
+	}
+#ifdef __VOLUME__
+	/* Exit/enter volume. */
+	kernel_volume_stack_enter_exit(kg, shadow_sd, volume_state->volume_stack);
+#endif
+	return false;
+}
+
+/* Special version which only handles opaque shadows. */
+ccl_device bool shadow_blocked_opaque(KernelGlobals *kg,
+                                      ShaderData *shadow_sd,
+                                      ccl_addr_space PathState *state,
+                                      const uint visibility,
+                                      Ray *ray,
+                                      Intersection *isect,
+                                      float3 *shadow)
+{
+	const bool blocked = scene_intersect(kg,
+	                                     *ray,
+	                                     visibility & PATH_RAY_SHADOW_OPAQUE,
+	                                     isect,
+	                                     NULL,
+	                                     0.0f, 0.0f);
+#ifdef __VOLUME__
+	if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
+		/* Apply attenuation from current volume shader. */
+		kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
+	}
+#endif
+	return blocked;
+}
 
-/* Shadow function to compute how much light is blocked, CPU variation.
+#ifdef __TRANSPARENT_SHADOWS__
+#  ifdef __SHADOW_RECORD_ALL__
+/* Shadow function to compute how much light is blocked,
  *
  * We trace a single ray. If it hits any opaque surface, or more than a given
  * number of transparent surfaces is hit, then we consider the geometry to be
@@ -36,261 +145,412 @@ CCL_NAMESPACE_BEGIN
  * or there is a performance increase anyway due to avoiding the need to send
  * two rays with transparent shadows.
  *
- * This is CPU only because of qsort, and malloc or high stack space usage to
- * record all these intersections. */
+ * On CPU it'll handle all transparent bounces (by allocating storage for
+ * intersections when they don't fit into the stack storage).
+ *
+ * On GPU it'll only handle SHADOW_STACK_MAX_HITS-1 intersections, so this
+ * is something to keep an eye on.
+ */
 
-#define STACK_MAX_HITS 64
+#    define SHADOW_STACK_MAX_HITS 64
 
-ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd, PathState *state, Ray *ray, float3 *shadow)
+/* Actual logic with traversal loop implementation which is free from device
+ * specific tweaks.
+ *
+ * Note that hits array should be as big as max_hits+1.
+ */
+ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
+                                                    ShaderData *sd,
+                                                    ShaderData *shadow_sd,
+                                                    ccl_addr_space PathState *state,
+                                                    const uint visibility,
+                                                    Ray *ray,
+                                                    Intersection *hits,
+                                                    uint max_hits,
+                                                    float3 *shadow)
 {
-	*shadow = make_float3(1.0f, 1.0f, 1.0f);
-
-	if(ray->t == 0.0f)
-		return false;
-	
-	bool blocked;
-
-	if(kernel_data.integrator.transparent_shadows) {
-		/* check transparent bounces here, for volume scatter which can do
-		 * lighting before surface path termination is checked */
-		if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce)
-			return true;
-
-		/* intersect to find an opaque surface, or record all transparent surface hits */
-		Intersection hits_stack[STACK_MAX_HITS];
-		Intersection *hits = hits_stack;
-		const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
-		uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
-
-		/* prefer to use stack but use dynamic allocation if too deep max hits
-		 * we need max_hits + 1 storage space due to the logic in
-		 * scene_intersect_shadow_all which will first store and then check if
-		 * the limit is exceeded */
-		if(max_hits + 1 > STACK_MAX_HITS) {
-			if(kg->transparent_shadow_intersections == NULL) {
-				kg->transparent_shadow_intersections =
-				    (Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1));
+	/* Intersect to find an opaque surface, or record all transparent
+	 * surface hits.
+	 */
+	uint num_hits;
+	const bool blocked = scene_intersect_shadow_all(kg,
+	                                                ray,
+	                                                hits,
+	                                                visibility,
+	                                                max_hits,
+	                                                &num_hits);
+#    ifdef __VOLUME__
+	VolumeState volume_state;
+#    endif
+	/* If no opaque surface found but we did find transparent hits,
+	 * shade them.
+	 */
+	if(!blocked && num_hits > 0) {
+		float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+		float3 Pend = ray->P + ray->D*ray->t;
+		float last_t = 0.0f;
+		int bounce = state->transparent_bounce;
+		Intersection *isect = hits;
+#    ifdef __VOLUME__
+#      ifdef __SPLIT_KERNEL__
+		ccl_addr_space
+#      endif
+		PathState *ps = shadow_blocked_volume_path_state(kg,
+		                                                 &volume_state,
+		                                                 state,
+		                                                 sd,
+		                                                 ray);
+#    endif
+		sort_intersections(hits, num_hits);
+		for(int hit = 0; hit < num_hits; hit++, isect++) {
+			/* Adjust intersection distance for moving ray forward. */
+			float new_t = isect->t;
+			isect->t -= last_t;
+			/* Skip hit if we did not move forward, step by step raytracing
+			 * would have skipped it as well then.
+			 */
+			if(last_t == new_t) {
+				continue;
 			}
-			hits = kg->transparent_shadow_intersections;
-		}
-
-		uint num_hits;
-		blocked = scene_intersect_shadow_all(kg, ray, hits, max_hits, &num_hits);
-
-		/* if no opaque surface found but we did find transparent hits, shade them */
-		if(!blocked && num_hits > 0) {
-			float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-			float3 Pend = ray->P + ray->D*ray->t;
-			float last_t = 0.0f;
-			int bounce = state->transparent_bounce;
-			Intersection *isect = hits;
-#ifdef __VOLUME__
-			PathState ps = *state;
-#endif
-
-			qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
-
-			for(int hit = 0; hit < num_hits; hit++, isect++) {
-				/* adjust intersection distance for moving ray forward */
-				float new_t = isect->t;
-				isect->t -= last_t;
-
-				/* skip hit if we did not move forward, step by step raytracing
-				 * would have skipped it as well then */
-				if(last_t == new_t)
-					continue;
-
-				last_t = new_t;
-
-#ifdef __VOLUME__
-				/* attenuation between last surface and next surface */
-				if(ps.volume_stack[0].shader != SHADER_NONE) {
-					Ray segment_ray = *ray;
-					segment_ray.t = isect->t;
-					kernel_volume_shadow(kg, shadow_sd, &ps, &segment_ray, &throughput);
-				}
-#endif
-
-				/* setup shader data at surface */
-				shader_setup_from_ray(kg, shadow_sd, isect, ray);
-
-				/* attenuation from transparent surface */
-				if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
-					path_state_modify_bounce(state, true);
-					shader_eval_surface(kg, shadow_sd, NULL, state, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
-					path_state_modify_bounce(state, false);
-
-					throughput *= shader_bsdf_transparency(kg, shadow_sd);
-				}
-
-				/* stop if all light is blocked */
-				if(is_zero(throughput)) {
-					return true;
-				}
-
-				/* move ray forward */
-				ray->P = shadow_sd->P;
-				if(ray->t != FLT_MAX) {
-					ray->D = normalize_len(Pend - ray->P, &ray->t);
-				}
-
+			last_t = new_t;
+			/* Attenuate the throughput. */
+			if(shadow_handle_transparent_isect(kg,
+			                                   shadow_sd,
+			                                   state,
 #ifdef __VOLUME__
-				/* exit/enter volume */
-				kernel_volume_stack_enter_exit(kg, shadow_sd, ps.volume_stack);
+			                                   ps,
 #endif
-
-				bounce++;
+			                                   isect,
+			                                   ray,
+			                                   &throughput))
+			{
+				return true;
 			}
-
-#ifdef __VOLUME__
-			/* attenuation for last line segment towards light */
-			if(ps.volume_stack[0].shader != SHADER_NONE)
-				kernel_volume_shadow(kg, shadow_sd, &ps, ray, &throughput);
-#endif
-
-			*shadow = throughput;
-
-			return is_zero(throughput);
+			/* Move ray forward. */
+			ray->P = shadow_sd->P;
+			if(ray->t != FLT_MAX) {
+				ray->D = normalize_len(Pend - ray->P, &ray->t);
+			}
+			bounce++;
 		}
+#    ifdef __VOLUME__
+		/* Attenuation for last line segment towards light. */
+		if(ps->volume_stack[0].shader != SHADER_NONE) {
+			kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput);
+		}
+#    endif
+		*shadow = throughput;
+		return is_zero(throughput);
 	}
-	else {
-		Intersection isect;
-		blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
-	}
-
-#ifdef __VOLUME__
+#    ifdef __VOLUME__
 	if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
-		/* apply attenuation from current volume shader */
-		kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
+		/* Apply attenuation from current volume shader. */
+#      ifdef __SPLIT_KERNEL__
+		ccl_addr_space
+#      endif
+		PathState *ps = shadow_blocked_volume_path_state(kg,
+		                                                 &volume_state,
+		                                                 state,
+		                                                 sd,
+		                                                 ray);
+		kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow);
 	}
-#endif
-
+#    endif
 	return blocked;
 }
 
-#undef STACK_MAX_HITS
-
-#else
+/* Here we do all device specific trickery before invoking actual traversal
+ * loop to help readability of the actual logic.
+ */
+ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
+                                               ShaderData *sd,
+                                               ShaderData *shadow_sd,
+                                               ccl_addr_space PathState *state,
+                                               const uint visibility,
+                                               Ray *ray,
+                                               uint max_hits,
+                                               float3 *shadow)
+{
+#    ifdef __SPLIT_KERNEL__
+	Intersection hits_[SHADOW_STACK_MAX_HITS];
+	Intersection *hits = &hits_[0];
+#    elif defined(__KERNEL_CUDA__)
+	Intersection *hits = kg->hits_stack;
+#    else
+	Intersection hits_stack[SHADOW_STACK_MAX_HITS];
+	Intersection *hits = hits_stack;
+#    endif
+#    ifndef __KERNEL_GPU__
+	/* Prefer to use stack, but use dynamic allocation if max hits is too deep:
+	 * we need max_hits + 1 storage space due to the logic in
+	 * scene_intersect_shadow_all which will first store and then check if
+	 * the limit is exceeded.
+	 *
+	 * Ignore this on GPU because of slow/unavailable malloc().
+	 */
+	if(max_hits + 1 > SHADOW_STACK_MAX_HITS) {
+		if(kg->transparent_shadow_intersections == NULL) {
+			const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
+			kg->transparent_shadow_intersections =
+				(Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1));
+		}
+		hits = kg->transparent_shadow_intersections;
+	}
+#    endif  /* __KERNEL_GPU__ */
+	/* Invoke actual traversal. */
+	return shadow_blocked_transparent_all_loop(kg,
+	                                           sd,
+	                                           shadow_sd,
+	                                           state,
+	                                           visibility,
+	                                           ray,
+	                                           hits,
+	                                           max_hits,
+	                                           shadow);
+}
+#  endif  /* __SHADOW_RECORD_ALL__ */
 
-/* Shadow function to compute how much light is blocked, GPU variation.
+#  if defined(__KERNEL_GPU__) || !defined(__SHADOW_RECORD_ALL__)
+/* Shadow function to compute how much light is blocked,
  *
  * Here we raytrace from one transparent surface to the next step by step.
  * To minimize overhead in cases where we don't need transparent shadows, we
  * first trace a regular shadow ray. We check if the hit primitive was
  * potentially transparent, and only in that case start marching. this gives
- * one extra ray cast for the cases were we do want transparency. */
+ * one extra ray cast for the cases where we do want transparency.
+ */
 
-ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
-                                        ShaderData *shadow_sd,
-                                        ccl_addr_space PathState *state,
-                                        ccl_addr_space Ray *ray_input,
-                                        float3 *shadow)
+/* This function is only implementing device-independent traversal logic
+ * which requires some precalculation to be done by the caller.
+ */
+ccl_device bool shadow_blocked_transparent_stepped_loop(
+        KernelGlobals *kg,
+        ShaderData *sd,
+        ShaderData *shadow_sd,
+        ccl_addr_space PathState *state,
+        const uint visibility,
+        Ray *ray,
+        Intersection *isect,
+        const bool blocked,
+        const bool is_transparent_isect,
+        float3 *shadow)
 {
-	*shadow = make_float3(1.0f, 1.0f, 1.0f);
-
-	if(ray_input->t == 0.0f)
-		return false;
-
-#ifdef __SPLIT_KERNEL__
-	Ray private_ray = *ray_input;
-	Ray *ray = &private_ray;
-#else
-	Ray *ray = ray_input;
-#endif
-
-#ifdef __SPLIT_KERNEL__
-	Intersection *isect = &kg->isect_shadow[SD_THREAD];
-#else
-	Intersection isect_object;
-	Intersection *isect = &isect_object;
-#endif
-
-	bool blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);
-
-#ifdef __TRANSPARENT_SHADOWS__
-	if(blocked && kernel_data.integrator.transparent_shadows) {
-		if(shader_transparent_shadow(kg, isect)) {
-			float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-			float3 Pend = ray->P + ray->D*ray->t;
-			int bounce = state->transparent_bounce;
-#ifdef __VOLUME__
-			PathState ps = *state;
-#endif
-
-			for(;;) {
-				if(bounce >= kernel_data.integrator.transparent_max_bounce)
-					return true;
-
-				if(!scene_intersect(kg, *ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f))
-				{
+#    ifdef __VOLUME__
+	VolumeState volume_state;
+#    endif
+	if(blocked && is_transparent_isect) {
+		float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+		float3 Pend = ray->P + ray->D*ray->t;
+		int bounce = state->transparent_bounce;
+#    ifdef __VOLUME__
+#      ifdef __SPLIT_KERNEL__
+		ccl_addr_space
+#      endif
+		PathState *ps = shadow_blocked_volume_path_state(kg,
+		                                                 &volume_state,
+		                                                 state,
+		                                                 sd,
+		                                                 ray);
+#    endif
+		for(;;) {
+			if(bounce >= kernel_data.integrator.transparent_max_bounce) {
+				return true;
+			}
+			if(!scene_intersect(kg,
+			                    *ray,
+			                    visibility & PATH_RAY_SHADOW_TRANSPARENT,
+			                    isect,
+			                    NULL,
+			                    0.0f, 0.0f))
+			{
+				break;
+			}
+			if(!shader_transparent_shadow(kg, isect)) {
+				return true;
+			}
+			/* Attenuate the throughput. */
+			if(shadow_handle_transparent_isect(kg,
+			                                   shadow_sd,
+			                                   state,
 #ifdef __VOLUME__
-					/* attenuation for last line segment towards light */
-					if(ps.volume_stack[0].shader != SHADER_NONE)
-						kernel_volume_shadow(kg, shadow_sd, &ps, ray, &throughput);
+			                                   ps,
 #endif
+			                                   isect,
+			                                   ray,
+			                                   &throughput))
+			{
+				return true;
+			}
+			/* Move ray forward. */
+			ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng);
+			if(ray->t != FLT_MAX) {
+				ray->D = normalize_len(Pend - ray->P, &ray->t);
+			}
+			bounce++;
+		}
+#    ifdef __VOLUME__
+		/* Attenuation for last line segment towards light. */
+		if(ps->volume_stack[0].shader != SHADER_NONE) {
+			kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput);
+		}
+#    endif
+		*shadow *= throughput;
+		return is_zero(throughput);
+	}
+#    ifdef __VOLUME__
+	if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
+		/* Apply attenuation from current volume shader. */
+#      ifdef __SPLIT_KERNEL__
+		ccl_addr_space
+#      endif
+		PathState *ps = shadow_blocked_volume_path_state(kg,
+		                                                 &volume_state,
+		                                                 state,
+		                                                 sd,
+		                                                 ray);
+		kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow);
+	}
+#    endif
+	return blocked;
+}
 
-					*shadow *= throughput;
-
-					return false;
-				}
+ccl_device bool shadow_blocked_transparent_stepped(
+        KernelGlobals *kg,
+        ShaderData *sd,
+        ShaderData *shadow_sd,
+        ccl_addr_space PathState *state,
+        const uint visibility,
+        Ray *ray,
+        Intersection *isect,
+        float3 *shadow)
+{
+	bool blocked = scene_intersect(kg,
+	                               *ray,
+	                               visibility & PATH_RAY_SHADOW_OPAQUE,
+	                               isect,
+	                               NULL,
+	                               0.0f, 0.0f);
+	bool is_transparent_isect = blocked
+		? shader_transparent_shadow(kg, isect)
+		: false;
+	return shadow_blocked_transparent_stepped_loop(kg,
+	                                               sd,
+	                                               shadow_sd,
+	                                               state,
+	                                               visibility,
+	                                               ray,
+	                                               isect,
+	                                               blocked,
+	                                               is_transparent_isect,
+	                                               shadow);
+}
 
-				if(!shader_transparent_shadow(kg, isect)) {
-					return true;
-				}
+#  endif  /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */
+#endif /* __TRANSPARENT_SHADOWS__ */
 
-#ifdef __VOLUME__
-				/* attenuation between last surface and next surface */
-				if(ps.volume_stack[0].shader != SHADER_NONE) {
-					Ray segment_ray = *ray;
-					segment_ray.t = isect->t;
-					kernel_volume_shadow(kg, shadow_sd, &ps, &segment_ray, &throughput);
-				}
+ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
+                                      ShaderData *sd,
+                                      ShaderData *shadow_sd,
+                                      ccl_addr_space PathState *state,
+                                      Ray *ray_input,
+                                      float3 *shadow)
+{
+	Ray *ray = ray_input;
+	Intersection isect;
+	/* Some common early checks. */
+	*shadow = make_float3(1.0f, 1.0f, 1.0f);
+	if(ray->t == 0.0f) {
+		return false;
+	}
+#ifdef __SHADOW_TRICKS__
+	const uint visibility = (state->flag & PATH_RAY_SHADOW_CATCHER)
+		? PATH_RAY_SHADOW_NON_CATCHER
+		: PATH_RAY_SHADOW;
+#else
+	const uint visibility = PATH_RAY_SHADOW;
 #endif
-
-				/* setup shader data at surface */
-				shader_setup_from_ray(kg, shadow_sd, isect, ray);
-
-				/* attenuation from transparent surface */
-				if(!(ccl_fetch(shadow_sd, flag) & SD_HAS_ONLY_VOLUME)) {
-					path_state_modify_bounce(state, true);
-					shader_eval_surface(kg, shadow_sd, NULL, state, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
-					path_state_modify_bounce(state, false);
-
-					throughput *= shader_bsdf_transparency(kg, shadow_sd);
-				}
-
-				/* stop if all light is blocked */
-				if(is_zero(throughput)) {
-					return true;
-				}
-
-				/* move ray forward */
-				ray->P = ray_offset(ccl_fetch(shadow_sd, P), -ccl_fetch(shadow_sd, Ng));
-				if(ray->t != FLT_MAX) {
-					ray->D = normalize_len(Pend - ray->P, &ray->t);
-				}
-
-#ifdef __VOLUME__
-				/* exit/enter volume */
-				kernel_volume_stack_enter_exit(kg, shadow_sd, ps.volume_stack);
+	/* Do actual shadow shading. */
+	/* First of all, we check if the integrator requires transparent shadows.
+	 * If not, we use the simplest and fastest way to calculate occlusion.
+	 */
+#ifdef __TRANSPARENT_SHADOWS__
+	if(!kernel_data.integrator.transparent_shadows)
 #endif
-
-				bounce++;
-			}
-		}
+	{
+		return shadow_blocked_opaque(kg,
+		                             shadow_sd,
+		                             state,
+		                             visibility,
+		                             ray,
+		                             &isect,
+		                             shadow);
 	}
-#ifdef __VOLUME__
-	else if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
-		/* apply attenuation from current volume shader */
-		kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
+#ifdef __TRANSPARENT_SHADOWS__
+#  ifdef __SHADOW_RECORD_ALL__
+	/* For the transparent shadows we try to use record-all logic on the
+	 * devices which support this.
+	 */
+	const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
+	/* Check transparent bounces here, for volume scatter which can do
+	 * lighting before surface path termination is checked.
+	 */
+	if(state->transparent_bounce >= transparent_max_bounce) {
+		return true;
 	}
-#endif
-#endif
-
-	return blocked;
+	const uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
+#    ifdef __KERNEL_GPU__
+	/* On GPU we do trickery with tracing opaque ray first, this avoids speed
+	 * regressions in some files.
+	 *
+	 * TODO(sergey): Check why using record-all behavior causes slowdown in such
+	 * cases. Could that be caused by a higher spill pressure?
+	 */
+	const bool blocked = scene_intersect(kg,
+	                                     *ray,
+	                                     visibility & PATH_RAY_SHADOW_OPAQUE,
+	                                     &isect,
+	                                     NULL,
+	                                     0.0f, 0.0f);
+	const bool is_transparent_isect = blocked
+	        ? shader_transparent_shadow(kg, &isect)
+	        : false;
+	if(!blocked || !is_transparent_isect ||
+	   max_hits + 1 >= SHADOW_STACK_MAX_HITS)
+	{
+		return shadow_blocked_transparent_stepped_loop(kg,
+		                                               sd,
+		                                               shadow_sd,
+		                                               state,
+		                                               visibility,
+		                                               ray,
+		                                               &isect,
+		                                               blocked,
+		                                               is_transparent_isect,
+		                                               shadow);
+	}
+#    endif  /* __KERNEL_GPU__ */
+	return shadow_blocked_transparent_all(kg,
+	                                      sd,
+	                                      shadow_sd,
+	                                      state,
+	                                      visibility,
+	                                      ray,
+	                                      max_hits,
+	                                      shadow);
+#  else  /* __SHADOW_RECORD_ALL__ */
+	/* Fall back to the slowest version, which works on all devices. */
+	return shadow_blocked_transparent_stepped(kg,
+	                                          sd,
+	                                          shadow_sd,
+	                                          state,
+	                                          visibility,
+	                                          ray,
+	                                          &isect,
+	                                          shadow);
+#  endif  /* __SHADOW_RECORD_ALL__ */
+#endif  /* __TRANSPARENT_SHADOWS__ */
 }
 
-#endif
+#undef SHADOW_STACK_MAX_HITS
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index 52c05b85aee..23a09e5e2ca 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -28,87 +28,31 @@ CCL_NAMESPACE_BEGIN
  * - try to reduce one sample model variance
  */
 
-#define BSSRDF_MULTI_EVAL
-
-ccl_device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, ShaderData *sd, float *probability)
-{
-	/* sum sample weights of bssrdf and bsdf */
-	float bsdf_sum = 0.0f;
-	float bssrdf_sum = 0.0f;
-
-	for(int i = 0; i < sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
-		
-		if(CLOSURE_IS_BSDF(sc->type))
-			bsdf_sum += sc->sample_weight;
-		else if(CLOSURE_IS_BSSRDF(sc->type))
-			bssrdf_sum += sc->sample_weight;
-	}
-
-	/* use bsdf or bssrdf? */
-	float r = sd->randb_closure*(bsdf_sum + bssrdf_sum);
-
-	if(r < bsdf_sum) {
-		/* use bsdf, and adjust randb so we can reuse it for picking a bsdf */
-		sd->randb_closure = r/bsdf_sum;
-		*probability = (bsdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/bsdf_sum: 1.0f;
-		return NULL;
-	}
-
-	/* use bssrdf */
-	r -= bsdf_sum;
-
-	float sum = 0.0f;
-
-	for(int i = 0; i < sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
-		
-		if(CLOSURE_IS_BSSRDF(sc->type)) {
-			sum += sc->sample_weight;
-
-			if(r <= sum) {
-				sd->randb_closure = (r - (sum - sc->sample_weight))/sc->sample_weight;
-
-#ifdef BSSRDF_MULTI_EVAL
-				*probability = (bssrdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/bssrdf_sum: 1.0f;
-#else
-				*probability = (bssrdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/sc->sample_weight: 1.0f;
-#endif
-				return sc;
-			}
-		}
-	}
-
-	/* should never happen */
-	sd->randb_closure = 0.0f;
-	*probability = 1.0f;
-	return NULL;
-}
-
 ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd,
-                                                 ShaderClosure *sc,
+                                                 const ShaderClosure *sc,
                                                  float disk_r,
                                                  float r,
                                                  bool all)
 {
-#ifdef BSSRDF_MULTI_EVAL
 	/* this is the veach one-sample model with balance heuristic, some pdf
 	 * factors drop out when using balance heuristic weighting */
 	float3 eval_sum = make_float3(0.0f, 0.0f, 0.0f);
 	float pdf_sum = 0.0f;
-	float sample_weight_sum = 0.0f;
-	int num_bssrdf = 0;
+	float sample_weight_inv = 0.0f;
 
-	for(int i = 0; i < sd->num_closure; i++) {
-		sc = &sd->closure[i];
-		
-		if(CLOSURE_IS_BSSRDF(sc->type)) {
-			float sample_weight = (all)? 1.0f: sc->sample_weight;
-			sample_weight_sum += sample_weight;
+	if(!all) {
+		float sample_weight_sum = 0.0f;
+
+		for(int i = 0; i < sd->num_closure; i++) {
+			sc = &sd->closure[i];
+
+			if(CLOSURE_IS_BSSRDF(sc->type)) {
+				sample_weight_sum += sc->sample_weight;
+			}
 		}
-	}
 
-	float sample_weight_inv = 1.0f/sample_weight_sum;
+		sample_weight_inv = 1.0f/sample_weight_sum;
+	}
 
 	for(int i = 0; i < sd->num_closure; i++) {
 		sc = &sd->closure[i];
@@ -125,38 +69,49 @@ ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd,
 			/* TODO power heuristic is not working correct here */
 			eval_sum += sc->weight*pdf; //*sample_weight*disk_pdf;
 			pdf_sum += sample_weight*disk_pdf; //*sample_weight*disk_pdf;
-
-			num_bssrdf++;
 		}
 	}
 
 	return (pdf_sum > 0.0f)? eval_sum / pdf_sum : make_float3(0.0f, 0.0f, 0.0f);
-#else
-	float pdf = bssrdf_pdf(pick_sc, r);
-	float disk_pdf = bssrdf_pdf(pick_sc, disk_r);
-
-	return pick_sc->weight * pdf / disk_pdf;
-#endif
 }
 
 /* replace closures with a single diffuse bsdf closure after scatter step */
-ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, float3 weight, bool hit, float3 N)
+ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, const ShaderClosure *sc, float3 weight, bool hit, float3 N)
 {
 	sd->flag &= ~SD_CLOSURE_FLAGS;
-	sd->randb_closure = 0.0f;
 	sd->num_closure = 0;
 	sd->num_closure_extra = 0;
 
 	if(hit) {
-		DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
-
-		if(bsdf) {
-			bsdf->N = N;
-			sd->flag |= bsdf_diffuse_setup(bsdf);
-
-			/* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes
-			 * can recognize it as not being a regular diffuse closure */
-			bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
+		const Bssrdf *bssrdf = (const Bssrdf *)sc;  /* read-only: only type and roughness are consulted */
+#ifdef __PRINCIPLED__
+		if(bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID) {
+			PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), weight);
+
+			if(bsdf) {
+				bsdf->N = N;
+				bsdf->roughness = bssrdf->roughness;
+				sd->flag |= bsdf_principled_diffuse_setup(bsdf);
+
+				/* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes
+				 * can recognize it as not being a regular Disney principled diffuse closure */
+				bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID;
+			}
+		}
+		else if(CLOSURE_IS_BSDF_BSSRDF(bssrdf->type) ||
+		        CLOSURE_IS_BSSRDF(bssrdf->type))
+#endif  /* __PRINCIPLED__ */
+		{
+			DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
+
+			if(bsdf) {
+				bsdf->N = N;
+				sd->flag |= bsdf_diffuse_setup(bsdf);
+
+				/* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes
+				 * can recognize it as not being a regular diffuse closure */
+				bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
+			}
 		}
 	}
 }
@@ -185,7 +140,7 @@ ccl_device float3 subsurface_color_pow(float3 color, float exponent)
 
 ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
                                            ShaderData *sd,
-                                           PathState *state,
+                                           ccl_addr_space PathState *state,
                                            int state_flag,
                                            float3 *eval,
                                            float3 *N)
@@ -199,7 +154,7 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
 
 	if(bump || texture_blur > 0.0f) {
 		/* average color and normal at incoming point */
-		shader_eval_surface(kg, sd, NULL, state, 0.0f, state_flag, SHADER_CONTEXT_SSS);
+		shader_eval_surface(kg, sd, state, state_flag);
 		float3 in_color = shader_bssrdf_sum(sd, (bump)? N: NULL, NULL);
 
 		/* we simply divide out the average color and multiply with the average
@@ -222,7 +177,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
         KernelGlobals *kg,
         SubsurfaceIntersection *ss_isect,
         ShaderData *sd,
-        ShaderClosure *sc,
+        const ShaderClosure *sc,
         uint *lcg_state,
         float disk_u,
         float disk_v,
@@ -235,26 +190,20 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 	disk_N = sd->Ng;
 	make_orthonormals(disk_N, &disk_T, &disk_B);
 
-	/* reusing variable for picking the closure gives a bit nicer stratification
-	 * for path tracer, for branched we do all closures so it doesn't help */
-	float axisu = (all)? disk_u: sd->randb_closure;
-
-	if(axisu < 0.5f) {
+	if(disk_u < 0.5f) {
 		pick_pdf_N = 0.5f;
 		pick_pdf_T = 0.25f;
 		pick_pdf_B = 0.25f;
-		if(all)
-			disk_u *= 2.0f;
+		disk_u *= 2.0f;
 	}
-	else if(axisu < 0.75f) {
+	else if(disk_u < 0.75f) {
 		float3 tmp = disk_N;
 		disk_N = disk_T;
 		disk_T = tmp;
 		pick_pdf_N = 0.25f;
 		pick_pdf_T = 0.5f;
 		pick_pdf_B = 0.25f;
-		if(all)
-			disk_u = (disk_u - 0.5f)*4.0f;
+		disk_u = (disk_u - 0.5f)*4.0f;
 	}
 	else {
 		float3 tmp = disk_N;
@@ -263,8 +212,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 		pick_pdf_N = 0.25f;
 		pick_pdf_T = 0.25f;
 		pick_pdf_B = 0.5f;
-		if(all)
-			disk_u = (disk_u - 0.75f)*4.0f;
+		disk_u = (disk_u - 0.75f)*4.0f;
 	}
 
 	/* sample point on disk */
@@ -277,7 +225,12 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 	float3 disk_P = (disk_r*cosf(phi)) * disk_T + (disk_r*sinf(phi)) * disk_B;
 
 	/* create ray */
+#ifdef __SPLIT_KERNEL__
+	Ray ray_object = ss_isect->ray;
+	Ray *ray = &ray_object;
+#else
 	Ray *ray = &ss_isect->ray;
+#endif
 	ray->P = sd->P + disk_N*disk_height + disk_P;
 	ray->D = -disk_N;
 	ray->t = 2.0f*disk_height;
@@ -288,7 +241,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 	/* intersect with the same object. if multiple intersections are found it
 	 * will use at most BSSRDF_MAX_HITS hits, a random subset of all hits */
 	scene_intersect_subsurface(kg,
-	                           ray,
+	                           *ray,
 	                           ss_isect,
 	                           sd->object,
 	                           lcg_state,
@@ -298,20 +251,20 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 	for(int hit = 0; hit < num_eval_hits; hit++) {
 		/* Quickly retrieve P and Ng without setting up ShaderData. */
 		float3 hit_P;
-		if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
+		if(sd->type & PRIMITIVE_TRIANGLE) {
 			hit_P = triangle_refine_subsurface(kg,
 			                                   sd,
 			                                   &ss_isect->hits[hit],
 			                                   ray);
 		}
 #ifdef __OBJECT_MOTION__
-		else  if(ccl_fetch(sd, type) & PRIMITIVE_MOTION_TRIANGLE) {
+		else  if(sd->type & PRIMITIVE_MOTION_TRIANGLE) {
 			float3 verts[3];
 			motion_triangle_vertices(
 			        kg,
-			        ccl_fetch(sd, object),
+			        sd->object,
 			        kernel_tex_fetch(__prim_index, ss_isect->hits[hit].prim),
-			        ccl_fetch(sd, time),
+			        sd->time,
 			        verts);
 			hit_P = motion_triangle_refine_subsurface(kg,
 			                                          sd,
@@ -351,6 +304,10 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 		ss_isect->weight[hit] = eval;
 	}
 
+#ifdef __SPLIT_KERNEL__
+	ss_isect->ray = *ray;
+#endif
+
 	return num_eval_hits;
 }
 
@@ -359,13 +316,25 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
         SubsurfaceIntersection* ss_isect,
         int hit,
         ShaderData *sd,
-        PathState *state,
+        ccl_addr_space PathState *state,
         int state_flag,
-        ShaderClosure *sc,
+        const ShaderClosure *sc,
         bool all)
 {
+#ifdef __SPLIT_KERNEL__
+	Ray ray_object = ss_isect->ray;
+	Ray *ray = &ray_object;
+#else
+	Ray *ray = &ss_isect->ray;
+#endif
+
+	/* Workaround for AMD GPU OpenCL compiler. Most probably cache bypass issue. */
+#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__)
+	kernel_split_params.dummy_sd_flag = sd->flag;
+#endif
+
 	/* Setup new shading point. */
-	shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], &ss_isect->ray);
+	shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], ray);
 
 	/* Optionally blur colors and bump mapping. */
 	float3 weight = ss_isect->weight[hit];
@@ -373,12 +342,12 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
 	subsurface_color_bump_blur(kg, sd, state, state_flag, &weight, &N);
 
 	/* Setup diffuse BSDF. */
-	subsurface_scatter_setup_diffuse_bsdf(sd, weight, true, N);
+	subsurface_scatter_setup_diffuse_bsdf(sd, sc, weight, true, N);
 }
 
 /* subsurface scattering step, from a point on the surface to another nearby point on the same object */
-ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathState *state,
-	int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
+ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state,
+	int state_flag, const ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
@@ -389,18 +358,20 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS
 	disk_N = sd->Ng;
 	make_orthonormals(disk_N, &disk_T, &disk_B);
 
-	if(sd->randb_closure < 0.5f) {
+	if(disk_u < 0.5f) {
 		pick_pdf_N = 0.5f;
 		pick_pdf_T = 0.25f;
 		pick_pdf_B = 0.25f;
+		disk_u *= 2.0f;
 	}
-	else if(sd->randb_closure < 0.75f) {
+	else if(disk_u < 0.75f) {
 		float3 tmp = disk_N;
 		disk_N = disk_T;
 		disk_T = tmp;
 		pick_pdf_N = 0.25f;
 		pick_pdf_T = 0.5f;
 		pick_pdf_B = 0.25f;
+		disk_u = (disk_u - 0.5f)*4.0f;
 	}
 	else {
 		float3 tmp = disk_N;
@@ -409,6 +380,7 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS
 		pick_pdf_N = 0.25f;
 		pick_pdf_T = 0.25f;
 		pick_pdf_B = 0.5f;
+		disk_u = (disk_u - 0.75f)*4.0f;
 	}
 
 	/* sample point on disk */
@@ -432,12 +404,16 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS
 	/* intersect with the same object. if multiple intersections are
 	 * found it will randomly pick one of them */
 	SubsurfaceIntersection ss_isect;
-	scene_intersect_subsurface(kg, &ray, &ss_isect, sd->object, lcg_state, 1);
+	scene_intersect_subsurface(kg, ray, &ss_isect, sd->object, lcg_state, 1);
 
 	/* evaluate bssrdf */
 	if(ss_isect.num_hits > 0) {
 		float3 origP = sd->P;
 
+		/* Workaround for AMD GPU OpenCL compiler. Most probably cache bypass issue. */
+#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__)
+		kernel_split_params.dummy_sd_flag = sd->flag;
+#endif
 		/* setup new shading point */
 		shader_setup_from_subsurface(kg, sd, &ss_isect.hits[0], &ray);
 
@@ -463,7 +439,7 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS
 	subsurface_color_bump_blur(kg, sd, state, state_flag, &eval, &N);
 
 	/* setup diffuse bsdf */
-	subsurface_scatter_setup_diffuse_bsdf(sd, eval, (ss_isect.num_hits > 0), N);
+	subsurface_scatter_setup_diffuse_bsdf(sd, sc, eval, (ss_isect.num_hits > 0), N);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index 65aeea18336..c8e54954a84 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -15,7 +15,7 @@
  */
 
 #ifndef KERNEL_TEX
-#  define KERNEL_TEX(type, ttype, name)
+#  define KERNEL_TEX(type, name)
 #endif
 
 #ifndef KERNEL_IMAGE_TEX
@@ -23,177 +23,169 @@
 #endif
 
 /* bvh */
-KERNEL_TEX(float4, texture_float4, __bvh_nodes)
-KERNEL_TEX(float4, texture_float4, __bvh_leaf_nodes)
-KERNEL_TEX(float4, texture_float4, __prim_tri_verts)
-KERNEL_TEX(uint, texture_uint, __prim_tri_index)
-KERNEL_TEX(uint, texture_uint, __prim_type)
-KERNEL_TEX(uint, texture_uint, __prim_visibility)
-KERNEL_TEX(uint, texture_uint, __prim_index)
-KERNEL_TEX(uint, texture_uint, __prim_object)
-KERNEL_TEX(uint, texture_uint, __object_node)
+KERNEL_TEX(float4, __bvh_nodes)
+KERNEL_TEX(float4, __bvh_leaf_nodes)
+KERNEL_TEX(float4, __prim_tri_verts)
+KERNEL_TEX(uint, __prim_tri_index)
+KERNEL_TEX(uint, __prim_type)
+KERNEL_TEX(uint, __prim_visibility)
+KERNEL_TEX(uint, __prim_index)
+KERNEL_TEX(uint, __prim_object)
+KERNEL_TEX(uint, __object_node)
+KERNEL_TEX(float2, __prim_time)
 
 /* objects */
-KERNEL_TEX(float4, texture_float4, __objects)
-KERNEL_TEX(float4, texture_float4, __objects_vector)
+KERNEL_TEX(float4, __objects)
+KERNEL_TEX(float4, __objects_vector)
 
 /* triangles */
-KERNEL_TEX(uint, texture_uint, __tri_shader)
-KERNEL_TEX(float4, texture_float4, __tri_vnormal)
-KERNEL_TEX(uint4, texture_uint4, __tri_vindex)
-KERNEL_TEX(uint, texture_uint, __tri_patch)
-KERNEL_TEX(float2, texture_float2, __tri_patch_uv)
+KERNEL_TEX(uint, __tri_shader)
+KERNEL_TEX(float4, __tri_vnormal)
+KERNEL_TEX(uint4, __tri_vindex)
+KERNEL_TEX(uint, __tri_patch)
+KERNEL_TEX(float2, __tri_patch_uv)
 
 /* curves */
-KERNEL_TEX(float4, texture_float4, __curves)
-KERNEL_TEX(float4, texture_float4, __curve_keys)
+KERNEL_TEX(float4, __curves)
+KERNEL_TEX(float4, __curve_keys)
 
 /* patches */
-KERNEL_TEX(uint, texture_uint, __patches)
+KERNEL_TEX(uint, __patches)
 
 /* attributes */
-KERNEL_TEX(uint4, texture_uint4, __attributes_map)
-KERNEL_TEX(float, texture_float, __attributes_float)
-KERNEL_TEX(float4, texture_float4, __attributes_float3)
-KERNEL_TEX(uchar4, texture_uchar4, __attributes_uchar4)
+KERNEL_TEX(uint4, __attributes_map)
+KERNEL_TEX(float, __attributes_float)
+KERNEL_TEX(float4, __attributes_float3)
+KERNEL_TEX(uchar4, __attributes_uchar4)
 
 /* lights */
-KERNEL_TEX(float4, texture_float4, __light_distribution)
-KERNEL_TEX(float4, texture_float4, __light_data)
-KERNEL_TEX(float2, texture_float2, __light_background_marginal_cdf)
-KERNEL_TEX(float2, texture_float2, __light_background_conditional_cdf)
+KERNEL_TEX(float4, __light_distribution)
+KERNEL_TEX(float4, __light_data)
+KERNEL_TEX(float2, __light_background_marginal_cdf)
+KERNEL_TEX(float2, __light_background_conditional_cdf)
 
 /* particles */
-KERNEL_TEX(float4, texture_float4, __particles)
+KERNEL_TEX(float4, __particles)
 
 /* shaders */
-KERNEL_TEX(uint4, texture_uint4, __svm_nodes)
-KERNEL_TEX(uint, texture_uint, __shader_flag)
-KERNEL_TEX(uint, texture_uint, __object_flag)
+KERNEL_TEX(uint4, __svm_nodes)
+KERNEL_TEX(uint, __shader_flag)
+KERNEL_TEX(uint, __object_flag)
 
 /* lookup tables */
-KERNEL_TEX(float, texture_float, __lookup_table)
+KERNEL_TEX(float, __lookup_table)
 
 /* sobol */
-KERNEL_TEX(uint, texture_uint, __sobol_directions)
+KERNEL_TEX(uint, __sobol_directions)
 
 /* volume */
-KERNEL_TEX(uint, texture_uint, __vol_shader)
+KERNEL_TEX(uint, __vol_shader)
 
-#ifdef __KERNEL_CUDA__
-#  if __CUDA_ARCH__ < 300
+#if !defined(__KERNEL_CUDA__) || __CUDA_ARCH__ >= 300
+/* image textures */
+KERNEL_TEX(TextureInfo, __texture_info)
+#else
 /* full-float image */
 KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_001)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_002)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_003)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_004)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_008)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_016)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_024)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_032)
 
 KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_000)
-KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_001)
-KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_002)
-KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_003)
-KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_004)
-
-/* image */
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_005)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_006)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_007)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_008)
+KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_008)
+KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_016)
+KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_024)
+KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_032)
+
+/* image
+ * The numeric suffix of each texture name is its flattened slot, as
+ * returned by ImageManager::type_index_to_flattened_slot(). */
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_001)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_009)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_010)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_011)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_012)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_013)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_014)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_015)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_016)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_017)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_018)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_019)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_020)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_021)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_022)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_023)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_024)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_025)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_026)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_027)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_028)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_029)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_030)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_031)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_032)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_033)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_034)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_035)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_036)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_037)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_038)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_039)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_040)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_041)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_042)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_043)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_044)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_045)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_046)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_047)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_048)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_049)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_050)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_051)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_052)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_053)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_054)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_055)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_056)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_057)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_058)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_059)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_060)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_061)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_062)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_063)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_064)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_065)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_066)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_067)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_068)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_069)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_070)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_071)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_072)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_073)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_074)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_075)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_076)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_077)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_078)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_079)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_080)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_081)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_082)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_083)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_084)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_085)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_086)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_087)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_088)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089)
-
-#  else
-/* bindless textures */
-KERNEL_TEX(uint, texture_uint, __bindless_mapping)
-#  endif
-#endif
-
-/* packed image (opencl) */
-KERNEL_TEX(uchar4, texture_uchar4, __tex_image_byte4_packed)
-KERNEL_TEX(float4, texture_float4, __tex_image_float4_packed)
-KERNEL_TEX(uchar, texture_uchar, __tex_image_byte_packed)
-KERNEL_TEX(float, texture_float, __tex_image_float_packed)
-KERNEL_TEX(uint4, texture_uint4, __tex_image_packed_info)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_097)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_105)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_113)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_121)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_129)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_137)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_145)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_153)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_161)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_169)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_177)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_185)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_193)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_201)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_209)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_217)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_225)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_233)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_241)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_249)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_257)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_265)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_273)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_281)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_289)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_297)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_305)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_313)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_321)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_329)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_337)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_345)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_353)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_361)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_369)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_377)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_385)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_393)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_401)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_409)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_417)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_425)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_433)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_441)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_449)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_457)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_465)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_473)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_481)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_489)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_497)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_505)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_513)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_521)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_529)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_537)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_545)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_553)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_561)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_569)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_577)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_585)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_593)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_601)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_609)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_617)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_625)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_633)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_641)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_649)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_657)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_665)
+#endif  /* defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300 */
 
 #undef KERNEL_TEX
 #undef KERNEL_IMAGE_TEX
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 15960dba40d..6c8e1c4e336 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -17,9 +17,9 @@
 #ifndef __KERNEL_TYPES_H__
 #define __KERNEL_TYPES_H__
 
-#include "kernel_math.h"
-#include "svm/svm_types.h"
-#include "util_static_assert.h"
+#include "kernel/kernel_math.h"
+#include "kernel/svm/svm_types.h"
+#include "util/util_static_assert.h"
 
 #ifndef __KERNEL_GPU__
 #  define __KERNEL_CPU__
@@ -56,6 +56,26 @@ CCL_NAMESPACE_BEGIN
 
 #define VOLUME_STACK_SIZE		16
 
+#define WORK_POOL_SIZE_GPU 64
+#define WORK_POOL_SIZE_CPU 1
+#ifdef __KERNEL_GPU__
+#  define WORK_POOL_SIZE WORK_POOL_SIZE_GPU
+#else
+#  define WORK_POOL_SIZE WORK_POOL_SIZE_CPU
+#endif
+
+
+#define SHADER_SORT_BLOCK_SIZE 2048
+
+#ifdef __KERNEL_OPENCL__
+#  define SHADER_SORT_LOCAL_SIZE 64
+#elif defined(__KERNEL_CUDA__)
+#  define SHADER_SORT_LOCAL_SIZE 32
+#else
+#  define SHADER_SORT_LOCAL_SIZE 1
+#endif
+
+
 /* device capabilities */
 #ifdef __KERNEL_CPU__
 #  ifdef __KERNEL_SSE2__
@@ -70,23 +90,28 @@ CCL_NAMESPACE_BEGIN
 #  ifdef WITH_OPENVDB
 #    define __OPENVDB__
 #  endif
+#  define __PRINCIPLED__
 #  define __SUBSURFACE__
 #  define __CMJ__
 #  define __VOLUME__
-#  define __VOLUME_DECOUPLED__
 #  define __VOLUME_SCATTER__
 #  define __SHADOW_RECORD_ALL__
+#  define __VOLUME_DECOUPLED__
 #  define __VOLUME_RECORD_ALL__
 #endif  /* __KERNEL_CPU__ */
 
 #ifdef __KERNEL_CUDA__
 #  define __KERNEL_SHADING__
 #  define __KERNEL_ADV_SHADING__
-#  define __BRANCHED_PATH__
 #  define __VOLUME__
 #  define __VOLUME_SCATTER__
 #  define __SUBSURFACE__
+#  define __PRINCIPLED__
+#  define __SHADOW_RECORD_ALL__
 #  define __CMJ__
+#  ifndef __SPLIT_KERNEL__
+#    define __BRANCHED_PATH__
+#  endif
 #endif  /* __KERNEL_CUDA__ */
 
 #ifdef __KERNEL_OPENCL__
@@ -96,36 +121,45 @@ CCL_NAMESPACE_BEGIN
 #  ifdef __KERNEL_OPENCL_NVIDIA__
 #    define __KERNEL_SHADING__
 #    define __KERNEL_ADV_SHADING__
-#    ifdef __KERNEL_EXPERIMENTAL__
-#      define __CMJ__
-#    endif
+#    define __SUBSURFACE__
+#    define __PRINCIPLED__
+#    define __VOLUME__
+#    define __VOLUME_SCATTER__
+#    define __SHADOW_RECORD_ALL__
+#    define __CMJ__
+#    define __BRANCHED_PATH__
 #  endif  /* __KERNEL_OPENCL_NVIDIA__ */
 
 #  ifdef __KERNEL_OPENCL_APPLE__
 #    define __KERNEL_SHADING__
 #    define __KERNEL_ADV_SHADING__
+#    define __PRINCIPLED__
+#    define __CMJ__
 /* TODO(sergey): Currently experimental section is ignored here,
  * this is because megakernel in device_opencl does not support
  * custom cflags depending on the scene features.
  */
-#    ifdef __KERNEL_EXPERIMENTAL__
-#      define __CMJ__
-#    endif
-#  endif  /* __KERNEL_OPENCL_NVIDIA__ */
+#  endif  /* __KERNEL_OPENCL_APPLE__ */
 
 #  ifdef __KERNEL_OPENCL_AMD__
 #    define __CL_USE_NATIVE__
 #    define __KERNEL_SHADING__
 #    define __KERNEL_ADV_SHADING__
+#    define __SUBSURFACE__
+#    define __PRINCIPLED__
+#    define __VOLUME__
+#    define __VOLUME_SCATTER__
+#    define __SHADOW_RECORD_ALL__
+#    define __CMJ__
+#    define __BRANCHED_PATH__
 #  endif  /* __KERNEL_OPENCL_AMD__ */
 
 #  ifdef __KERNEL_OPENCL_INTEL_CPU__
 #    define __CL_USE_NATIVE__
 #    define __KERNEL_SHADING__
 #    define __KERNEL_ADV_SHADING__
-#    ifdef __KERNEL_EXPERIMENTAL__
-#      define __CMJ__
-#    endif
+#    define __PRINCIPLED__
+#    define __CMJ__
 #  endif  /* __KERNEL_OPENCL_INTEL_CPU__ */
 
 #endif  /* __KERNEL_OPENCL__ */
@@ -143,6 +177,9 @@ CCL_NAMESPACE_BEGIN
 #define __INTERSECTION_REFINE__
 #define __CLAMP_SAMPLE__
 #define __PATCH_EVAL__
+#define __SHADOW_TRICKS__
+
+#define __DENOISING_FEATURES__
 
 #ifdef __KERNEL_SHADING__
 #  define __SVM__
@@ -195,10 +232,18 @@ CCL_NAMESPACE_BEGIN
 #ifdef __NO_PATCH_EVAL__
 #  undef __PATCH_EVAL__
 #endif
-
-/* Random Numbers */
-
-typedef uint RNG;
+#ifdef __NO_TRANSPARENT__
+#  undef __TRANSPARENT_SHADOWS__
+#endif
+#ifdef __NO_SHADOW_TRICKS__
+#  undef __SHADOW_TRICKS__
+#endif
+#ifdef __NO_PRINCIPLED__
+#  undef __PRINCIPLED__
+#endif
+#ifdef __NO_DENOISING__
+#  undef __DENOISING_FEATURES__
+#endif
 
 /* Shader Evaluation */
 
@@ -239,31 +284,21 @@ enum PathTraceDimension {
 	PRNG_FILTER_V = 1,
 	PRNG_LENS_U = 2,
 	PRNG_LENS_V = 3,
-#ifdef __CAMERA_MOTION__
 	PRNG_TIME = 4,
 	PRNG_UNUSED_0 = 5,
 	PRNG_UNUSED_1 = 6,	/* for some reason (6, 7) is a bad sobol pattern */
 	PRNG_UNUSED_2 = 7,  /* with a low number of samples (< 64) */
-#endif
-	PRNG_BASE_NUM = 8,
+	PRNG_BASE_NUM = 10,
 
 	PRNG_BSDF_U = 0,
 	PRNG_BSDF_V = 1,
-	PRNG_BSDF = 2,
-	PRNG_LIGHT = 3,
-	PRNG_LIGHT_U = 4,
-	PRNG_LIGHT_V = 5,
-	PRNG_LIGHT_TERMINATE = 6,
-	PRNG_TERMINATE = 7,
-
-#ifdef __VOLUME__
-	PRNG_PHASE_U = 8,
-	PRNG_PHASE_V = 9,
-	PRNG_PHASE = 10,
-	PRNG_SCATTER_DISTANCE = 11,
-#endif
-
-	PRNG_BOUNCE_NUM = 12,
+	PRNG_LIGHT_U = 2,
+	PRNG_LIGHT_V = 3,
+	PRNG_LIGHT_TERMINATE = 4,
+	PRNG_TERMINATE = 5,
+	PRNG_PHASE_CHANNEL = 6,
+	PRNG_SCATTER_DISTANCE = 7,
+	PRNG_BOUNCE_NUM = 8,
 };
 
 enum SamplingPattern {
@@ -276,29 +311,36 @@ enum SamplingPattern {
 /* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */
 
 enum PathRayFlag {
-	PATH_RAY_CAMERA = 1,
-	PATH_RAY_REFLECT = 2,
-	PATH_RAY_TRANSMIT = 4,
-	PATH_RAY_DIFFUSE = 8,
-	PATH_RAY_GLOSSY = 16,
-	PATH_RAY_SINGULAR = 32,
-	PATH_RAY_TRANSPARENT = 64,
-
-	PATH_RAY_SHADOW_OPAQUE = 128,
-	PATH_RAY_SHADOW_TRANSPARENT = 256,
-	PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
-
-	PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */
-	PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */
+	PATH_RAY_CAMERA              = (1 << 0),
+	PATH_RAY_REFLECT             = (1 << 1),
+	PATH_RAY_TRANSMIT            = (1 << 2),
+	PATH_RAY_DIFFUSE             = (1 << 3),
+	PATH_RAY_GLOSSY              = (1 << 4),
+	PATH_RAY_SINGULAR            = (1 << 5),
+	PATH_RAY_TRANSPARENT         = (1 << 6),
+
+	PATH_RAY_SHADOW_OPAQUE_NON_CATCHER       = (1 << 7),
+	PATH_RAY_SHADOW_OPAQUE_CATCHER           = (1 << 8),
+	PATH_RAY_SHADOW_OPAQUE                   = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER|PATH_RAY_SHADOW_OPAQUE_CATCHER),
+	PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER  = (1 << 9),
+	PATH_RAY_SHADOW_TRANSPARENT_CATCHER      = (1 << 10),
+	PATH_RAY_SHADOW_TRANSPARENT              = (PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER|PATH_RAY_SHADOW_TRANSPARENT_CATCHER),
+	PATH_RAY_SHADOW_NON_CATCHER              = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER|PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER),
+	PATH_RAY_SHADOW                          = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
+
+	PATH_RAY_CURVE               = (1 << 11), /* visibility flag to define curve segments */
+	PATH_RAY_VOLUME_SCATTER      = (1 << 12), /* volume scattering */
 
 	/* Special flag to tag unaligned BVH nodes. */
-	PATH_RAY_NODE_UNALIGNED = 2048,
+	PATH_RAY_NODE_UNALIGNED = (1 << 13),
 
-	PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048),
+	PATH_RAY_ALL_VISIBILITY = ((1 << 14)-1),
 
-	PATH_RAY_MIS_SKIP = 4096,
-	PATH_RAY_DIFFUSE_ANCESTOR = 8192,
-	PATH_RAY_SINGLE_PASS_DONE = 16384,
+	PATH_RAY_MIS_SKIP            = (1 << 15),
+	PATH_RAY_DIFFUSE_ANCESTOR    = (1 << 16),
+	PATH_RAY_SINGLE_PASS_DONE    = (1 << 17),
+	PATH_RAY_SHADOW_CATCHER      = (1 << 18),
+	PATH_RAY_STORE_SHADOW_INFO   = (1 << 19),
 };
 
 /* Closure Label */
@@ -345,14 +387,31 @@ typedef enum PassType {
 	PASS_SUBSURFACE_COLOR = (1 << 24),
 	PASS_LIGHT = (1 << 25), /* no real pass, used to force use_light_pass */
 #ifdef __KERNEL_DEBUG__
-	PASS_BVH_TRAVERSAL_STEPS = (1 << 26),
+	PASS_BVH_TRAVERSED_NODES = (1 << 26),
 	PASS_BVH_TRAVERSED_INSTANCES = (1 << 27),
-	PASS_RAY_BOUNCES = (1 << 28),
+	PASS_BVH_INTERSECTIONS = (1 << 28),
+	PASS_RAY_BOUNCES = (1 << 29),
 #endif
 } PassType;
 
 #define PASS_ALL (~0)
 
+typedef enum DenoisingPassOffsets {  /* Offsets increase in steps of 3, except steps of 1 after DEPTH and DEPTH_VAR. */
+	DENOISING_PASS_NORMAL             = 0,
+	DENOISING_PASS_NORMAL_VAR         = 3,
+	DENOISING_PASS_ALBEDO             = 6,
+	DENOISING_PASS_ALBEDO_VAR         = 9,
+	DENOISING_PASS_DEPTH              = 12,
+	DENOISING_PASS_DEPTH_VAR          = 13,
+	DENOISING_PASS_SHADOW_A           = 14,
+	DENOISING_PASS_SHADOW_B           = 17,
+	DENOISING_PASS_COLOR              = 20,
+	DENOISING_PASS_COLOR_VAR          = 23,
+
+	DENOISING_PASS_SIZE_BASE          = 26,  /* NOTE(review): equals COLOR_VAR offset + 3, presumably the total size of the entries above - confirm. */
+	DENOISING_PASS_SIZE_CLEAN         = 3,  /* NOTE(review): presumably the extra size used by the "clean" pass - confirm against users. */
+} DenoisingPassOffsets;
+
 typedef enum BakePassFilter {
 	BAKE_FILTER_NONE = 0,
 	BAKE_FILTER_DIRECT = (1 << 0),
@@ -386,18 +445,54 @@ typedef enum BakePassFilterCombos {
 	BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE),
 } BakePassFilterCombos;
 
+typedef enum DenoiseFlag {  /* Bit flags: one DIR and one IND bit each for diffuse, glossy, transmission and subsurface. */
+	DENOISING_CLEAN_DIFFUSE_DIR      = (1 << 0),
+	DENOISING_CLEAN_DIFFUSE_IND      = (1 << 1),
+	DENOISING_CLEAN_GLOSSY_DIR       = (1 << 2),
+	DENOISING_CLEAN_GLOSSY_IND       = (1 << 3),
+	DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4),
+	DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5),
+	DENOISING_CLEAN_SUBSURFACE_DIR   = (1 << 6),
+	DENOISING_CLEAN_SUBSURFACE_IND   = (1 << 7),
+	DENOISING_CLEAN_ALL_PASSES       = (1 << 8)-1,  /* Low eight bits set: the union of all flags above. */
+} DenoiseFlag;
+
+#ifdef __KERNEL_DEBUG__
+/* NOTE: This is a runtime-only struct, alignment is not
+ * really important here.
+ */
+typedef struct DebugData {
+	int num_bvh_traversed_nodes;
+	int num_bvh_traversed_instances;
+	int num_bvh_intersections;
+	int num_ray_bounces;
+} DebugData;
+#endif
+
+typedef ccl_addr_space struct PathRadianceState {
+#ifdef __PASSES__
+	float3 diffuse;
+	float3 glossy;
+	float3 transmission;
+	float3 subsurface;
+	float3 scatter;
+
+	float3 direct;
+#endif
+} PathRadianceState;
+
 typedef ccl_addr_space struct PathRadiance {
 #ifdef __PASSES__
 	int use_light_pass;
 #endif
 
+	float transparent;
 	float3 emission;
 #ifdef __PASSES__
 	float3 background;
 	float3 ao;
 
 	float3 indirect;
-	float3 direct_throughput;
 	float3 direct_emission;
 
 	float3 color_diffuse;
@@ -418,15 +513,46 @@ typedef ccl_addr_space struct PathRadiance {
 	float3 indirect_subsurface;
 	float3 indirect_scatter;
 
-	float3 path_diffuse;
-	float3 path_glossy;
-	float3 path_transmission;
-	float3 path_subsurface;
-	float3 path_scatter;
-
 	float4 shadow;
 	float mist;
 #endif
+
+	struct PathRadianceState state;
+
+#ifdef __SHADOW_TRICKS__
+	/* Total light reachable across the path, ignoring shadow blocked queries. */
+	float3 path_total;
+	/* Total light reachable across the path with shadow blocked queries
+	 * applied here.
+	 *
+	 * Dividing this figure by path_total will give estimate of shadow pass.
+	 */
+	float3 path_total_shaded;
+
+	/* Color of the background on which shadow is alpha-overed. */
+	float3 shadow_background_color;
+
+	/* Path radiance sum and throughput at the moment when ray hits shadow
+	 * catcher object.
+	 */
+	float shadow_throughput;
+
+	/* Accumulated transparency along the path after shadow catcher bounce. */
+	float shadow_transparency;
+
+	/* Indicate if any shadow catcher data is set. */
+	int has_shadow_catcher;
+#endif
+
+#ifdef __DENOISING_FEATURES__
+	float3 denoising_normal;
+	float3 denoising_albedo;
+	float denoising_depth;
+#endif  /* __DENOISING_FEATURES__ */
+
+#ifdef __KERNEL_DEBUG__
+	DebugData debug_data;
+#endif /* __KERNEL_DEBUG__ */
 } PathRadiance;
 
 typedef struct BsdfEval {
@@ -442,6 +568,9 @@ typedef struct BsdfEval {
 	float3 subsurface;
 	float3 scatter;
 #endif
+#ifdef __SHADOW_TRICKS__
+	float3 sum_no_mis;
+#endif
 } BsdfEval;
 
 /* Shader Flag */
@@ -535,29 +664,32 @@ typedef struct Ray {
 
 /* Intersection */
 
-typedef ccl_addr_space struct Intersection {
+typedef struct Intersection {
 	float t, u, v;
 	int prim;
 	int object;
 	int type;
 
 #ifdef __KERNEL_DEBUG__
-	int num_traversal_steps;
+	int num_traversed_nodes;
 	int num_traversed_instances;
+	int num_intersections;
 #endif
 } Intersection;
 
 /* Primitives */
 
 typedef enum PrimitiveType {
-	PRIMITIVE_NONE = 0,
-	PRIMITIVE_TRIANGLE = 1,
-	PRIMITIVE_MOTION_TRIANGLE = 2,
-	PRIMITIVE_CURVE = 4,
-	PRIMITIVE_MOTION_CURVE = 8,
-	/* Lamp primitive is not included below on purpose, since it is no real traceable primitive */
-	PRIMITIVE_LAMP = 16,
-	PRIMITIVE_VOLUME = 32,
+	PRIMITIVE_NONE            = 0,
+	PRIMITIVE_TRIANGLE        = (1 << 0),
+	PRIMITIVE_MOTION_TRIANGLE = (1 << 1),
+	PRIMITIVE_CURVE           = (1 << 2),
+	PRIMITIVE_MOTION_CURVE    = (1 << 3),
+	/* Lamp primitive is not included below on purpose,
+	 * since it is no real traceable primitive.
+	 */
+	PRIMITIVE_LAMP            = (1 << 4),
+	PRIMITIVE_VOLUME          = (1 << 5),
 
 	PRIMITIVE_ALL_TRIANGLE = (PRIMITIVE_TRIANGLE|PRIMITIVE_MOTION_TRIANGLE),
 	PRIMITIVE_ALL_CURVE = (PRIMITIVE_CURVE|PRIMITIVE_MOTION_CURVE),
@@ -565,14 +697,14 @@ typedef enum PrimitiveType {
 	PRIMITIVE_ALL_VOLUME = (PRIMITIVE_VOLUME),
 	PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE|PRIMITIVE_ALL_CURVE|PRIMITIVE_ALL_VOLUME),
 
-	/* Total number of different primitives.
+	/* Total number of different traceable primitives.
 	 * NOTE: This is an actual value, not a bitflag.
 	 */
 	PRIMITIVE_NUM_TOTAL = 4,
 } PrimitiveType;
 
-#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << 16) | type)
-#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> 16)
+#define PRIMITIVE_PACK_SEGMENT(type, segment) (((segment) << PRIMITIVE_NUM_TOTAL) | (type))  /* Stores segment above the low PRIMITIVE_NUM_TOTAL bits; arguments are parenthesized so expressions expand safely. */
+#define PRIMITIVE_UNPACK_SEGMENT(type) ((type) >> PRIMITIVE_NUM_TOTAL)  /* Inverse shift of PRIMITIVE_PACK_SEGMENT: drops the low PRIMITIVE_NUM_TOTAL bits. */
 
 /* Attributes */
 
@@ -665,175 +797,197 @@ typedef struct AttributeDescriptor {
 #define SHADER_CLOSURE_BASE \
 	float3 weight; \
 	ClosureType type; \
-	float sample_weight \
+	float sample_weight; \
+	float3 N
 
 typedef ccl_addr_space struct ccl_align(16) ShaderClosure {
 	SHADER_CLOSURE_BASE;
 
-	float data[14]; /* pad to 80 bytes */
+	float data[10]; /* pad to 80 bytes */
 } ShaderClosure;
 
-/* Shader Context
- *
- * For OSL we recycle a fixed number of contexts for speed */
-
-typedef enum ShaderContext {
-	SHADER_CONTEXT_MAIN = 0,
-	SHADER_CONTEXT_INDIRECT = 1,
-	SHADER_CONTEXT_EMISSION = 2,
-	SHADER_CONTEXT_SHADOW = 3,
-	SHADER_CONTEXT_SSS = 4,
-	SHADER_CONTEXT_VOLUME = 5,
-	SHADER_CONTEXT_NUM = 6
-} ShaderContext;
-
 /* Shader Data
  *
  * Main shader state at a point on the surface or in a volume. All coordinates
- * are in world space. */
+ * are in world space.
+ */
 
 enum ShaderDataFlag {
-	/* runtime flags */
-	SD_BACKFACING      = (1 << 0),   /* backside of surface? */
-	SD_EMISSION        = (1 << 1),   /* have emissive closure? */
-	SD_BSDF            = (1 << 2),   /* have bsdf closure? */
-	SD_BSDF_HAS_EVAL   = (1 << 3),   /* have non-singular bsdf closure? */
-	SD_BSSRDF          = (1 << 4),   /* have bssrdf */
-	SD_HOLDOUT         = (1 << 5),   /* have holdout closure? */
-	SD_ABSORPTION      = (1 << 6),   /* have volume absorption closure? */
-	SD_SCATTER         = (1 << 7),   /* have volume phase closure? */
-	SD_AO              = (1 << 8),   /* have ao closure? */
-	SD_TRANSPARENT     = (1 << 9),  /* have transparent closure? */
+	/* Runtime flags. */
+
+	/* Set when ray hits backside of surface. */
+	SD_BACKFACING      = (1 << 0),
+	/* Shader has emissive closure. */
+	SD_EMISSION        = (1 << 1),
+	/* Shader has BSDF closure. */
+	SD_BSDF            = (1 << 2),
+	/* Shader has non-singular BSDF closure. */
+	SD_BSDF_HAS_EVAL   = (1 << 3),
+	/* Shader has BSSRDF closure. */
+	SD_BSSRDF          = (1 << 4),
+	/* Shader has holdout closure. */
+	SD_HOLDOUT         = (1 << 5),
+	/* Shader has volume absorption closure. */
+	SD_ABSORPTION      = (1 << 6),
+	/* Shader has volume phase (scatter) closure. */
+	SD_SCATTER         = (1 << 7),
+	/* Shader has AO closure. */
+	SD_AO              = (1 << 8),
+	/* Shader has transparent closure. */
+	SD_TRANSPARENT     = (1 << 9),
+	/* BSDF requires LCG for evaluation. */
 	SD_BSDF_NEEDS_LCG  = (1 << 10),
 
-	SD_CLOSURE_FLAGS = (SD_EMISSION|SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF|
-	                    SD_HOLDOUT|SD_ABSORPTION|SD_SCATTER|SD_AO|
+	SD_CLOSURE_FLAGS = (SD_EMISSION |
+	                    SD_BSDF |
+	                    SD_BSDF_HAS_EVAL |
+	                    SD_BSSRDF |
+	                    SD_HOLDOUT |
+	                    SD_ABSORPTION |
+	                    SD_SCATTER |
+	                    SD_AO |
 	                    SD_BSDF_NEEDS_LCG),
 
-	/* shader flags */
-	SD_USE_MIS                = (1 << 12),  /* direct light sample */
-	SD_HAS_TRANSPARENT_SHADOW = (1 << 13),  /* has transparent shadow */
-	SD_HAS_VOLUME             = (1 << 14),  /* has volume shader */
-	SD_HAS_ONLY_VOLUME        = (1 << 15),  /* has only volume shader, no surface */
-	SD_HETEROGENEOUS_VOLUME   = (1 << 16),  /* has heterogeneous volume */
-	SD_HAS_BSSRDF_BUMP        = (1 << 17),  /* bssrdf normal uses bump */
-	SD_VOLUME_EQUIANGULAR     = (1 << 18),  /* use equiangular sampling */
-	SD_VOLUME_MIS             = (1 << 19),  /* use multiple importance sampling */
-	SD_VOLUME_CUBIC           = (1 << 20),  /* use cubic interpolation for voxels */
-	SD_HAS_BUMP               = (1 << 21),  /* has data connected to the displacement input */
-	SD_HAS_DISPLACEMENT       = (1 << 22),  /* has true displacement */
-	SD_HAS_CONSTANT_EMISSION  = (1 << 23),  /* has constant emission (value stored in __shader_flag) */
-
-	SD_SHADER_FLAGS = (SD_USE_MIS|SD_HAS_TRANSPARENT_SHADOW|SD_HAS_VOLUME|
-	                   SD_HAS_ONLY_VOLUME|SD_HETEROGENEOUS_VOLUME|
-	                   SD_HAS_BSSRDF_BUMP|SD_VOLUME_EQUIANGULAR|SD_VOLUME_MIS|
-	                   SD_VOLUME_CUBIC|SD_HAS_BUMP|SD_HAS_DISPLACEMENT|SD_HAS_CONSTANT_EMISSION),
-
-	/* object flags */
-	SD_HOLDOUT_MASK             = (1 << 24),  /* holdout for camera rays */
-	SD_OBJECT_MOTION            = (1 << 25),  /* has object motion blur */
-	SD_TRANSFORM_APPLIED        = (1 << 26),  /* vertices have transform applied */
-	SD_NEGATIVE_SCALE_APPLIED   = (1 << 27),  /* vertices have negative scale applied */
-	SD_OBJECT_HAS_VOLUME        = (1 << 28),  /* object has a volume shader */
-	SD_OBJECT_INTERSECTS_VOLUME = (1 << 29),  /* object intersects AABB of an object with volume shader */
-	SD_OBJECT_HAS_VERTEX_MOTION = (1 << 30),  /* has position for motion vertices */
-
-	SD_OBJECT_FLAGS = (SD_HOLDOUT_MASK|SD_OBJECT_MOTION|SD_TRANSFORM_APPLIED|
-	                   SD_NEGATIVE_SCALE_APPLIED|SD_OBJECT_HAS_VOLUME|
-	                   SD_OBJECT_INTERSECTS_VOLUME)
+	/* Shader flags. */
+
+	/* Direct light sample. */
+	SD_USE_MIS                = (1 << 16),
+	/* Has transparent shadow. */
+	SD_HAS_TRANSPARENT_SHADOW = (1 << 17),
+	/* Has volume shader. */
+	SD_HAS_VOLUME             = (1 << 18),
+	/* Has only volume shader, no surface. */
+	SD_HAS_ONLY_VOLUME        = (1 << 19),
+	/* Has heterogeneous volume. */
+	SD_HETEROGENEOUS_VOLUME   = (1 << 20),
+	/* BSSRDF normal uses bump. */
+	SD_HAS_BSSRDF_BUMP        = (1 << 21),
+	/* Use equiangular volume sampling */
+	SD_VOLUME_EQUIANGULAR     = (1 << 22),
+	/* Use multiple importance volume sampling. */
+	SD_VOLUME_MIS             = (1 << 23),
+	/* Use cubic interpolation for voxels. */
+	SD_VOLUME_CUBIC           = (1 << 24),
+	/* Has data connected to the displacement input or uses bump map. */
+	SD_HAS_BUMP               = (1 << 25),
+	/* Has true displacement. */
+	SD_HAS_DISPLACEMENT       = (1 << 26),
+	/* Has constant emission (value stored in __shader_flag) */
+	SD_HAS_CONSTANT_EMISSION  = (1 << 27),
+
+	SD_SHADER_FLAGS = (SD_USE_MIS |
+	                   SD_HAS_TRANSPARENT_SHADOW |
+	                   SD_HAS_VOLUME |
+	                   SD_HAS_ONLY_VOLUME |
+	                   SD_HETEROGENEOUS_VOLUME|
+	                   SD_HAS_BSSRDF_BUMP |
+	                   SD_VOLUME_EQUIANGULAR |
+	                   SD_VOLUME_MIS |
+	                   SD_VOLUME_CUBIC |
+	                   SD_HAS_BUMP |
+	                   SD_HAS_DISPLACEMENT |
+	                   SD_HAS_CONSTANT_EMISSION)
 };
 
-#ifdef __SPLIT_KERNEL__
-#  define SD_THREAD (get_global_id(1) * get_global_size(0) + get_global_id(0))
-#  if !defined(__SPLIT_KERNEL_SOA__)
-     /* ShaderData is stored as an Array-of-Structures */
-#    define ccl_soa_member(type, name) type soa_##name
-#    define ccl_fetch(s, t) (s[SD_THREAD].soa_##t)
-#    define ccl_fetch_array(s, t, index) (&s[SD_THREAD].soa_##t[index])
-#  else
-     /* ShaderData is stored as an Structure-of-Arrays */
-#    define SD_GLOBAL_SIZE (get_global_size(0) * get_global_size(1))
-#    define SD_FIELD_SIZE(t) sizeof(((struct ShaderData*)0)->t)
-#    define SD_OFFSETOF(t) ((char*)(&((struct ShaderData*)0)->t) - (char*)0)
-#    define ccl_soa_member(type, name) type soa_##name
-#    define ccl_fetch(s, t) (((ShaderData*)((ccl_addr_space char*)s + SD_GLOBAL_SIZE * SD_OFFSETOF(soa_##t) +  SD_FIELD_SIZE(soa_##t) * SD_THREAD - SD_OFFSETOF(soa_##t)))->soa_##t)
-#    define ccl_fetch_array(s, t, index) (&ccl_fetch(s, t)[index])
-#  endif
-#else
-#  define ccl_soa_member(type, name) type name
-#  define ccl_fetch(s, t) (s->t)
-#  define ccl_fetch_array(s, t, index) (&s->t[index])
-#endif
+	/* Object flags. */
+enum ShaderDataObjectFlag {  /* Per-object bit flags, stored separately from ShaderDataFlag (see ShaderData::object_flag). */
+	/* Holdout for camera rays. */
+	SD_OBJECT_HOLDOUT_MASK           = (1 << 0),
+	/* Has object motion blur. */
+	SD_OBJECT_MOTION                 = (1 << 1),
+	/* Vertices have transform applied. */
+	SD_OBJECT_TRANSFORM_APPLIED      = (1 << 2),
+	/* Vertices have negative scale applied. */
+	SD_OBJECT_NEGATIVE_SCALE_APPLIED = (1 << 3),
+	/* Object has a volume shader. */
+	SD_OBJECT_HAS_VOLUME             = (1 << 4),
+	/* Object intersects AABB of an object with volume shader. */
+	SD_OBJECT_INTERSECTS_VOLUME      = (1 << 5),
+	/* Has position for motion vertices. */
+	SD_OBJECT_HAS_VERTEX_MOTION      = (1 << 6),
+	/* Object is used to catch shadows. */
+	SD_OBJECT_SHADOW_CATCHER         = (1 << 7),
+
+	SD_OBJECT_FLAGS = (SD_OBJECT_HOLDOUT_MASK |
+	                   SD_OBJECT_MOTION |
+	                   SD_OBJECT_TRANSFORM_APPLIED |
+	                   SD_OBJECT_NEGATIVE_SCALE_APPLIED |
+	                   SD_OBJECT_HAS_VOLUME |
+	                   SD_OBJECT_INTERSECTS_VOLUME |
+	                   SD_OBJECT_SHADOW_CATCHER)  /* NOTE(review): SD_OBJECT_HAS_VERTEX_MOTION is not part of this mask - confirm this is intentional. */
+};
 
 typedef ccl_addr_space struct ShaderData {
 	/* position */
-	ccl_soa_member(float3, P);
+	float3 P;
 	/* smooth normal for shading */
-	ccl_soa_member(float3, N);
+	float3 N;
 	/* true geometric normal */
-	ccl_soa_member(float3, Ng);
+	float3 Ng;
 	/* view/incoming direction */
-	ccl_soa_member(float3, I);
+	float3 I;
 	/* shader id */
-	ccl_soa_member(int, shader);
+	int shader;
 	/* booleans describing shader, see ShaderDataFlag */
-	ccl_soa_member(int, flag);
+	int flag;
+	/* booleans describing object of the shader, see ShaderDataObjectFlag */
+	int object_flag;
 
 	/* primitive id if there is one, ~0 otherwise */
-	ccl_soa_member(int, prim);
+	int prim;
 
 	/* combined type and curve segment for hair */
-	ccl_soa_member(int, type);
+	int type;
 
 	/* parametric coordinates
 	 * - barycentric weights for triangles */
-	ccl_soa_member(float, u);
-	ccl_soa_member(float, v);
+	float u;
+	float v;
 	/* object id if there is one, ~0 otherwise */
-	ccl_soa_member(int, object);
+	int object;
 
 	/* motion blur sample time */
-	ccl_soa_member(float, time);
+	float time;
 
 	/* length of the ray being shaded */
-	ccl_soa_member(float, ray_length);
+	float ray_length;
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* differential of P. these are orthogonal to Ng, not N */
-	ccl_soa_member(differential3, dP);
+	differential3 dP;
 	/* differential of I */
-	ccl_soa_member(differential3, dI);
+	differential3 dI;
 	/* differential of u, v */
-	ccl_soa_member(differential, du);
-	ccl_soa_member(differential, dv);
+	differential du;
+	differential dv;
 #endif
 #ifdef __DPDU__
 	/* differential of P w.r.t. parametric coordinates. note that dPdu is
 	 * not readily suitable as a tangent for shading on triangles. */
-	ccl_soa_member(float3, dPdu);
-	ccl_soa_member(float3, dPdv);
+	float3 dPdu;
+	float3 dPdv;
 #endif
 
 #ifdef __OBJECT_MOTION__
 	/* object <-> world space transformations, cached to avoid
 	 * re-interpolating them constantly for shading */
-	ccl_soa_member(Transform, ob_tfm);
-	ccl_soa_member(Transform, ob_itfm);
+	Transform ob_tfm;
+	Transform ob_itfm;
 #endif
 
 	/* Closure data, we store a fixed array of closures */
-	ccl_soa_member(struct ShaderClosure, closure[MAX_CLOSURE]);
-	ccl_soa_member(int, num_closure);
-	ccl_soa_member(int, num_closure_extra);
-	ccl_soa_member(float, randb_closure);
-	ccl_soa_member(float3, svm_closure_weight);
+	struct ShaderClosure closure[MAX_CLOSURE];
+	int num_closure;
+	int num_closure_extra;
+	float randb_closure;
+	float3 svm_closure_weight;
 
 	/* LCG state for closures that require additional random numbers. */
-	ccl_soa_member(uint, lcg_state);
+	uint lcg_state;
 
 	/* ray start position, only set for backgrounds */
-	ccl_soa_member(float3, ray_P);
-	ccl_soa_member(differential3, ray_dP);
+	float3 ray_P;
+	differential3 ray_dP;
 
 #ifdef __OSL__
 	struct KernelGlobals *osl_globals;
@@ -856,9 +1010,11 @@ typedef struct PathState {
 	int flag;
 
 	/* random number generator state */
-	int rng_offset;    		/* dimension offset */
-	int sample;        		/* path sample number */
-	int num_samples;		/* total number of times this path will be sampled */
+	uint rng_hash;          /* per pixel hash */
+	int rng_offset;         /* dimension offset */
+	int sample;             /* path sample number */
+	int num_samples;        /* total number of times this path will be sampled */
+	float branch_factor;    /* number of branches in indirect paths */
 
 	/* bounce counting */
 	int bounce;
@@ -867,6 +1023,10 @@ typedef struct PathState {
 	int transmission_bounce;
 	int transparent_bounce;
 
+#ifdef __DENOISING_FEATURES__
+	float denoising_feature_weight;
+#endif  /* __DENOISING_FEATURES__ */
+
 	/* multiple importance sampling */
 	float min_ray_pdf; /* smallest bounce pdf over entire path up to now */
 	float ray_pdf;     /* last bounce pdf */
@@ -877,7 +1037,7 @@ typedef struct PathState {
 	/* volume rendering */
 #ifdef __VOLUME__
 	int volume_bounce;
-	RNG rng_congruential;
+	uint rng_congruential;
 	VolumeStack volume_stack[VOLUME_STACK_SIZE];
 #endif
 } PathState;
@@ -885,29 +1045,25 @@ typedef struct PathState {
 /* Subsurface */
 
 /* Struct to gather multiple SSS hits. */
-struct SubsurfaceIntersection
-{
+typedef struct SubsurfaceIntersection {
 	Ray ray;
 	float3 weight[BSSRDF_MAX_HITS];
 
 	int num_hits;
 	struct Intersection hits[BSSRDF_MAX_HITS];
 	float3 Ng[BSSRDF_MAX_HITS];
-};
+} SubsurfaceIntersection;
 
 /* Struct to gather SSS indirect rays and delay tracing them. */
-struct SubsurfaceIndirectRays
-{
-	bool need_update_volume_stack;
-	bool tracing;
+typedef struct SubsurfaceIndirectRays {
 	PathState state[BSSRDF_MAX_HITS];
-	struct PathRadiance direct_L;
 
 	int num_rays;
+
 	struct Ray rays[BSSRDF_MAX_HITS];
 	float3 throughputs[BSSRDF_MAX_HITS];
-	struct PathRadiance L[BSSRDF_MAX_HITS];
-};
+	struct PathRadianceState L_state[BSSRDF_MAX_HITS];
+} SubsurfaceIndirectRays;
 
 /* Constant Kernel Data
  *
@@ -1040,11 +1196,16 @@ typedef struct KernelFilm {
 	float mist_inv_depth;
 	float mist_falloff;
 
+	int pass_denoising_data;
+	int pass_denoising_clean;
+	int denoising_flags;
+	int pad;
+
 #ifdef __KERNEL_DEBUG__
-	int pass_bvh_traversal_steps;
+	int pass_bvh_traversed_nodes;
 	int pass_bvh_traversed_instances;
+	int pass_bvh_intersections;
 	int pass_ray_bounces;
-	int pass_pad3;
 #endif
 } KernelFilm;
 static_assert_align(KernelFilm, 16);
@@ -1080,7 +1241,6 @@ typedef struct KernelIntegrator {
 	int portal_offset;
 
 	/* bounces */
-	int min_bounce;
 	int max_bounce;
 
 	int max_diffuse_bounce;
@@ -1088,8 +1248,9 @@ typedef struct KernelIntegrator {
 	int max_transmission_bounce;
 	int max_volume_bounce;
 
+	int ao_bounces;
+
 	/* transparent */
-	int transparent_min_bounce;
 	int transparent_max_bounce;
 	int transparent_shadows;
 
@@ -1107,6 +1268,7 @@ typedef struct KernelIntegrator {
 
 	/* branched path */
 	int branched;
+	int volume_decoupled;
 	int diffuse_samples;
 	int glossy_samples;
 	int transmission_samples;
@@ -1131,7 +1293,7 @@ typedef struct KernelIntegrator {
 
 	float light_inv_rr_threshold;
 
-	int pad1;
+	int start_sample;
 } KernelIntegrator;
 static_assert_align(KernelIntegrator, 16);
 
@@ -1143,7 +1305,8 @@ typedef struct KernelBVH {
 	int have_curves;
 	int have_instancing;
 	int use_qbvh;
-	int pad1, pad2;
+	int use_bvh_steps;
+	int pad1;
 } KernelBVH;
 static_assert_align(KernelBVH, 16);
 
@@ -1185,19 +1348,6 @@ typedef struct KernelData {
 } KernelData;
 static_assert_align(KernelData, 16);
 
-#ifdef __KERNEL_DEBUG__
-/* NOTE: This is a runtime-only struct, alignment is not
- * really important here.
- */
-typedef ccl_addr_space struct DebugData {
-	// Total number of BVH node traversal steps and primitives intersections
-	// for the camera rays.
-	int num_bvh_traversal_steps;
-	int num_bvh_traversed_instances;
-	int num_ray_bounces;
-} DebugData;
-#endif
-
 /* Declarations required for split kernel */
 
 /* Macro for queues */
@@ -1210,7 +1360,6 @@ typedef ccl_addr_space struct DebugData {
  * Queue 3 - Shadow ray cast kernel - AO
  * Queeu 4 - Shadow ray cast kernel - direct lighting
  */
-#define NUM_QUEUES 4
 
 /* Queue names */
 enum QueueNumber {
@@ -1223,45 +1372,75 @@ enum QueueNumber {
 	 * 3. Rays to be regenerated
 	 * are enqueued here.
 	 */
-	QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS = 1,
+	QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
 
 	/* All rays for which a shadow ray should be cast to determine radiance
 	 * contribution for AO are enqueued here.
 	 */
-	QUEUE_SHADOW_RAY_CAST_AO_RAYS = 2,
+	QUEUE_SHADOW_RAY_CAST_AO_RAYS,
 
 	/* All rays for which a shadow ray should be cast to determine radiance
 	 * contributing for direct lighting are enqueued here.
 	 */
-	QUEUE_SHADOW_RAY_CAST_DL_RAYS = 3,
+	QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+
+	/* Rays sorted according to shader->id */
+	QUEUE_SHADER_SORTED_RAYS,
+
+#ifdef __BRANCHED_PATH__
+	/* All rays moving to next iteration of the indirect loop for light */
+	QUEUE_LIGHT_INDIRECT_ITER,
+	/* Queue of all inactive rays. These are candidates for sharing work of indirect loops */
+	QUEUE_INACTIVE_RAYS,
+#  ifdef __VOLUME__
+	/* All rays moving to next iteration of the indirect loop for volumes */
+	QUEUE_VOLUME_INDIRECT_ITER,
+#  endif
+#  ifdef __SUBSURFACE__
+	/* All rays moving to next iteration of the indirect loop for subsurface */
+	QUEUE_SUBSURFACE_INDIRECT_ITER,
+#  endif
+#endif  /* __BRANCHED_PATH__ */
+
+	NUM_QUEUES
 };
 
-/* We use RAY_STATE_MASK to get ray_state (enums 0 to 5) */
-#define RAY_STATE_MASK 0x007
-#define RAY_FLAG_MASK 0x0F8
+/* We use RAY_STATE_MASK to get ray_state */
+#define RAY_STATE_MASK 0x0F
+#define RAY_FLAG_MASK 0xF0
 enum RayState {
+	RAY_INVALID = 0,
 	/* Denotes ray is actively involved in path-iteration. */
-	RAY_ACTIVE = 0,
+	RAY_ACTIVE,
 	/* Denotes ray has completed processing all samples and is inactive. */
-	RAY_INACTIVE = 1,
+	RAY_INACTIVE,
 	/* Denoted ray has exited path-iteration and needs to update output buffer. */
-	RAY_UPDATE_BUFFER = 2,
+	RAY_UPDATE_BUFFER,
 	/* Donotes ray has hit background */
-	RAY_HIT_BACKGROUND = 3,
+	RAY_HIT_BACKGROUND,
 	/* Denotes ray has to be regenerated */
-	RAY_TO_REGENERATE = 4,
+	RAY_TO_REGENERATE,
 	/* Denotes ray has been regenerated */
-	RAY_REGENERATED = 5,
-	/* Denotes ray should skip direct lighting */
-	RAY_SKIP_DL = 6,
-	/* Flag's ray has to execute shadow blocked function in AO part */
-	RAY_SHADOW_RAY_CAST_AO = 16,
-	/* Flag's ray has to execute shadow blocked function in direct lighting part. */
-	RAY_SHADOW_RAY_CAST_DL = 32,
+	RAY_REGENERATED,
+	/* Denotes ray is moving to next iteration of the branched indirect loop */
+	RAY_LIGHT_INDIRECT_NEXT_ITER,
+	RAY_VOLUME_INDIRECT_NEXT_ITER,
+	RAY_SUBSURFACE_INDIRECT_NEXT_ITER,
+
+	/* Ray flags */
+
+	/* Flags to denote that the ray is currently evaluating the branched indirect loop */
+	RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4),
+	RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5),
+	RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6),
+	RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | RAY_BRANCHED_SUBSURFACE_INDIRECT),
+
+	/* Ray is evaluating an iteration of an indirect loop for another thread */
+	RAY_BRANCHED_INDIRECT_SHARED = (1 << 7),
 };
 
 #define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state))
-#define IS_STATE(ray_state, ray_index, state) ((ray_state[ray_index] & RAY_STATE_MASK) == state)
+#define IS_STATE(ray_state, ray_index, state) ((ray_index) != QUEUE_EMPTY_SLOT && ((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state))
 #define ADD_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] | flag))
 #define REMOVE_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] & (~flag)))
 #define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag)
@@ -1276,6 +1455,20 @@ enum RayState {
 #define PATCH_MAP_NODE_IS_LEAF (1u << 31)
 #define PATCH_MAP_NODE_INDEX_MASK (~(PATCH_MAP_NODE_IS_SET | PATCH_MAP_NODE_IS_LEAF))
 
+/* Work Tiles: a pixel rectangle plus a sample range that work indices are mapped onto (see get_work_pixel()). */
+
+typedef struct WorkTile {
+	uint x, y, w, h;  /* Pixel origin (x, y) and size (w, h) of the tile. */
+
+	uint start_sample;  /* Sample number given to the first sample of the tile. */
+	uint num_samples;  /* Presumably the number of samples to render -- TODO confirm. */
+
+	uint offset;  /* Presumably an indexing offset into `buffer` -- TODO confirm. */
+	uint stride;  /* Presumably the row stride used with `offset` -- TODO confirm. */
+
+	ccl_global float *buffer;  /* Float buffer; presumably the render output -- TODO confirm. */
+} WorkTile;
+
 CCL_NAMESPACE_END
 
 #endif /*  __KERNEL_TYPES_H__ */
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index e973afe79eb..35f58850f56 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -41,12 +41,12 @@ typedef struct VolumeShaderCoefficients {
 /* evaluate shader to get extinction coefficient at P */
 ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg,
                                                        ShaderData *sd,
-                                                       PathState *state,
+                                                       ccl_addr_space PathState *state,
                                                        float3 P,
                                                        float3 *extinction)
 {
 	sd->P = P;
-	shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
+	shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW);
 
 	if(!(sd->flag & (SD_ABSORPTION|SD_SCATTER)))
 		return false;
@@ -67,12 +67,12 @@ ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg,
 /* evaluate shader to get absorption, scattering and emission at P */
 ccl_device_inline bool volume_shader_sample(KernelGlobals *kg,
                                             ShaderData *sd,
-                                            PathState *state,
+                                            ccl_addr_space PathState *state,
                                             float3 P,
                                             VolumeShaderCoefficients *coeff)
 {
 	sd->P = P;
-	shader_eval_volume(kg, sd, state, state->volume_stack, state->flag, SHADER_CONTEXT_VOLUME);
+	shader_eval_volume(kg, sd, state, state->volume_stack, state->flag);
 
 	if(!(sd->flag & (SD_ABSORPTION|SD_SCATTER|SD_EMISSION)))
 		return false;
@@ -115,7 +115,7 @@ ccl_device float kernel_volume_channel_get(float3 value, int channel)
 	return (channel == 0)? value.x: ((channel == 1)? value.y: value.z);
 }
 
-ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, VolumeStack *stack)
+ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, ccl_addr_space VolumeStack *stack)
 {
 	for(int i = 0; stack[i].shader != SHADER_NONE; i++) {
 		int shader_flag = kernel_tex_fetch(__shader_flag, (stack[i].shader & SHADER_MASK)*SHADER_SIZE);
@@ -164,7 +164,11 @@ ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stac
 
 /* homogeneous volume: assume shader evaluation at the starts gives
  * the extinction coefficient for the entire line segment */
-ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput)
+ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg,
+                                                 ccl_addr_space PathState *state,
+                                                 Ray *ray,
+                                                 ShaderData *sd,
+                                                 float3 *throughput)
 {
 	float3 sigma_t;
 
@@ -206,7 +210,11 @@ ccl_device_inline bool kernel_volume_integrate_shadow_ray(
 
 /* heterogeneous volume: integrate stepping through the volume until we
  * reach the end, get absorbed entirely, or run out of iterations */
-ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput)
+ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg,
+                                                   ccl_addr_space PathState *state,
+                                                   Ray *ray,
+                                                   ShaderData *sd,
+                                                   float3 *throughput)
 {
 	float3 tp = *throughput;
 	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
@@ -214,7 +222,7 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 	/* prepare for stepping */
 	int max_steps = kernel_data.integrator.volume_max_steps;
 	float step = kernel_data.integrator.volume_step_size;
-	float random_jitter_offset = lcg_step_float(&state->rng_congruential) * step;
+	float random_jitter_offset = lcg_step_float_addrspace(&state->rng_congruential) * step;
 
 	/* compute extinction at the start */
 	float t = 0.0f;
@@ -295,7 +303,11 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 
 /* get the volume attenuation over line segment defined by ray, with the
  * assumption that there are no surfaces blocking light between the endpoints */
-ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg, ShaderData *shadow_sd, PathState *state, Ray *ray, float3 *throughput)
+ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg,
+                                              ShaderData *shadow_sd,
+                                              ccl_addr_space PathState *state,
+                                              Ray *ray,
+                                              float3 *throughput)
 {
 	shader_setup_from_volume(kg, shadow_sd, ray);
 
@@ -313,11 +325,18 @@ ccl_device float kernel_volume_equiangular_sample(Ray *ray, float3 light_P, floa
 	float t = ray->t;
 
 	float delta = dot((light_P - ray->P) , ray->D);
-	float D = sqrtf(len_squared(light_P - ray->P) - delta * delta);
+	float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
+	if(UNLIKELY(D == 0.0f)) {
+		*pdf = 0.0f;
+		return 0.0f;
+	}
 	float theta_a = -atan2f(delta, D);
 	float theta_b = atan2f(t - delta, D);
 	float t_ = D * tanf((xi * theta_b) + (1 - xi) * theta_a);
-
+	if(UNLIKELY(theta_b == theta_a)) {
+		*pdf = 0.0f;
+		return 0.0f;
+	}
 	*pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
 
 	return min(t, delta + t_); /* min is only for float precision errors */
@@ -326,13 +345,19 @@ ccl_device float kernel_volume_equiangular_sample(Ray *ray, float3 light_P, floa
 ccl_device float kernel_volume_equiangular_pdf(Ray *ray, float3 light_P, float sample_t)
 {
 	float delta = dot((light_P - ray->P) , ray->D);
-	float D = sqrtf(len_squared(light_P - ray->P) - delta * delta);
+	float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
+	if(UNLIKELY(D == 0.0f)) {
+		return 0.0f;
+	}
 
 	float t = ray->t;
 	float t_ = sample_t - delta;
 
 	float theta_a = -atan2f(delta, D);
 	float theta_b = atan2f(t - delta, D);
+	if(UNLIKELY(theta_b == theta_a)) {
+		return 0.0f;
+	}
 
 	float pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
 
@@ -396,9 +421,14 @@ ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coe
 
 /* homogeneous volume: assume shader evaluation at the start gives
  * the volume shading coefficient for the entire line segment */
-ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGlobals *kg,
-	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput,
-	RNG *rng, bool probalistic_scatter)
+ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(
+    KernelGlobals *kg,
+    ccl_addr_space PathState *state,
+    Ray *ray,
+    ShaderData *sd,
+    PathRadiance *L,
+    ccl_addr_space float3 *throughput,
+    bool probalistic_scatter)
 {
 	VolumeShaderCoefficients coeff;
 
@@ -417,13 +447,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 
 		/* pick random color channel, we use the Veach one-sample
 		 * model with balance heuristic for the channels */
-		float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
+		float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
 		int channel = (int)(rphase*3.0f);
-		sd->randb_closure = rphase*3.0f - channel;
 
 		/* decide if we will hit or miss */
 		bool scatter = true;
-		float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+		float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
 
 		if(probalistic_scatter) {
 			float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
@@ -476,7 +505,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 		float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
 		float3 transmittance = volume_color_transmittance(sigma_t, ray->t);
 		float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, ray->t);
-		path_radiance_accum_emission(L, *throughput, emission, state->bounce);
+		path_radiance_accum_emission(L, state, *throughput, emission);
 	}
 
 	/* modify throughput */
@@ -496,17 +525,27 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
 }
 
 ccl_device_inline VolumeIntegrateResult kernel_volume_integrate_ray(
-        KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd,
-        PathRadiance *L, float3 *throughput, float t, float new_t,
-        float random_jitter_offset, bool has_scatter, float3 *accum_transmittance,
-        int channel, const float tp_eps, float *xi)
+        KernelGlobals *kg,
+        PathState *state,
+        Ray *ray,
+        ShaderData *sd,
+        PathRadiance *L,
+        float3 *throughput,
+        float t,
+        float new_t,
+        float random_jitter_offset,
+        bool has_scatter,
+        float3 *accum_transmittance,
+        int channel,
+        const float tp_eps,
+        float *xi)
 {
 	float dt = new_t - t;
 	float3 tp = *throughput;
 
 	/* use random position inside this segment to sample shader */
 	if(new_t == ray->t)
-		random_jitter_offset = lcg_step_float(&state->rng_congruential) * dt;
+		random_jitter_offset = lcg_step_float_addrspace(&state->rng_congruential) * dt;
 
 	float3 new_P = ray->P + ray->D * (t + random_jitter_offset);
 	VolumeShaderCoefficients coeff;
@@ -568,7 +607,7 @@ ccl_device_inline VolumeIntegrateResult kernel_volume_integrate_ray(
 		/* integrate emission attenuated by absorption */
 		if(L && (closure_flag & SD_EMISSION)) {
 			float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, dt);
-			path_radiance_accum_emission(L, tp, emission, state->bounce);
+			path_radiance_accum_emission(L, state, tp, emission);
 		}
 
 		/* modify throughput */
@@ -606,7 +645,7 @@ ccl_device_inline VolumeIntegrateResult kernel_volume_integrate_ray(
  * iterations. this does probabilistically scatter or get transmitted through
  * for path tracing where we don't want to branch. */
 ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
-	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, RNG *rng)
+	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput)
 {
 	VolumeIntegrateResult result = VOLUME_PATH_MISSED;
 	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
@@ -614,7 +653,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
 	/* prepare for stepping */
 	int max_steps = kernel_data.integrator.volume_max_steps;
 	float step_size = kernel_data.integrator.volume_step_size;
-	float random_jitter_offset = lcg_step_float(&state->rng_congruential) * step_size;
+	float random_jitter_offset = lcg_step_float_addrspace(&state->rng_congruential) * step_size;
 
 	/* compute coefficients at the start */
 	float t = 0.0f;
@@ -622,8 +661,8 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
 
 	/* pick random color channel, we use the Veach one-sample
 	 * model with balance heuristic for the channels */
-	float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
-	float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
+	float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
+	float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
 	int channel = (int)(rphase*3.0f);
 	sd->randb_closure = rphase*3.0f - channel;
 	bool has_scatter = false;
@@ -702,22 +741,24 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
  * ray, with the assumption that there are no surfaces blocking light
  * between the endpoints. distance sampling is used to decide if we will
  * scatter or not. */
-ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg,
-	PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng, bool heterogeneous)
+ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(
+    KernelGlobals *kg,
+    ccl_addr_space PathState *state,
+    ShaderData *sd,
+    Ray *ray,
+    PathRadiance *L,
+    ccl_addr_space float3 *throughput,
+    bool heterogeneous)
 {
-	/* workaround to fix correlation bug in T38710, can find better solution
-	 * in random number generator later, for now this is done here to not impact
-	 * performance of rendering without volumes */
-	RNG tmp_rng = cmj_hash(*rng, state->rng_offset);
-
 	shader_setup_from_volume(kg, sd, ray);
 
 	if(heterogeneous)
-		return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput, &tmp_rng);
+		return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput);
 	else
-		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, &tmp_rng, true);
+		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, true);
 }
 
+#ifndef __SPLIT_KERNEL__
 /* Decoupled Volume Sampling
  *
  * VolumeSegment is list of coefficients and transmittance stored at all steps
@@ -756,6 +797,7 @@ typedef struct VolumeSegment {
  * but the entire segment is needed to do always scattering, rather than probabilistically
  * hitting or missing the volume. if we don't know the transmittance at the end of the
  * volume we can't generate stratified distance samples up to that transmittance */
+#ifdef __VOLUME_DECOUPLED__
 ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg, PathState *state,
 	Ray *ray, ShaderData *sd, VolumeSegment *segment, bool heterogeneous)
 {
@@ -1026,6 +1068,7 @@ ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *s
 #endif
 	}
 }
+#endif  /* __VOLUME_DECOUPLED__ */
 
 /* scattering for homogeneous and heterogeneous volumes, using decoupled ray
  * marching.
@@ -1041,7 +1084,6 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 	/* pick random color channel, we use the Veach one-sample
 	 * model with balance heuristic for the channels */
 	int channel = (int)(rphase*3.0f);
-	sd->randb_closure = rphase*3.0f - channel;
 	float xi = rscatter;
 
 	/* probabilistic scattering decision based on transmittance */
@@ -1195,6 +1237,9 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 			mis_weight = 2.0f*power_heuristic(pdf, distance_pdf);
 		}
 	}
+	if(sample_t < 0.0f || pdf == 0.0f) {
+		return VOLUME_PATH_MISSED;
+	}
 
 	/* compute transmittance up to this step */
 	if(step != segment->steps)
@@ -1216,6 +1261,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 
 	return VOLUME_PATH_SCATTERED;
 }
+#endif /* __SPLIT_KERNEL__ */
 
 /* decide if we need to use decoupled or not */
 ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneous, bool direct, int sampling_method)
@@ -1223,6 +1269,9 @@ ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneou
 	/* decoupled ray marching for heterogeneous volumes not supported on the GPU,
 	 * which also means equiangular and multiple importance sampling is not
 	 * support for that case */
+	if(!kernel_data.integrator.volume_decoupled)
+		return false;
+
 #ifdef __KERNEL_GPU__
 	if(heterogeneous)
 		return false;
@@ -1247,9 +1296,9 @@ ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneou
 
 ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
                                          ShaderData *stack_sd,
-                                         const PathState *state,
-                                         const Ray *ray,
-                                         VolumeStack *stack)
+                                         ccl_addr_space const PathState *state,
+                                         ccl_addr_space const Ray *ray,
+                                         ccl_addr_space VolumeStack *stack)
 {
 	/* NULL ray happens in the baker, does it need proper initialization of
 	 * camera in volume?
@@ -1393,7 +1442,7 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
 	}
 }
 
-ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd, VolumeStack *stack)
+ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd, ccl_addr_space VolumeStack *stack)
 {
 	/* todo: we should have some way for objects to indicate if they want the
 	 * world shader to work inside them. excluding it by default is problematic
@@ -1442,7 +1491,7 @@ ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd
 ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg,
                                                           ShaderData *stack_sd,
                                                           Ray *ray,
-                                                          VolumeStack *stack)
+                                                          ccl_addr_space VolumeStack *stack)
 {
 	kernel_assert(kernel_data.integrator.use_volumes);
 
@@ -1489,4 +1538,30 @@ ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg,
 }
 #endif
 
+/* Clean stack after the last bounce.
+ *
+ * It is expected that all volumes are closed manifolds, so by the time a ray
+ * hits nothing (for example, the last bounce escapes to the environment) the
+ * only volume expected to remain in the stack is the world's one. All other
+ * volume entries should have been exited already.
+ *
+ * This isn't always true because of ray intersection precision issues, which
+ * can leave a non-world volume in the stack that is then treated as infinite,
+ * causing render artifacts.
+ *
+ * Call this function after the last bounce to drop every volume except the
+ * world's one from the stack and avoid those artifacts.
+ */
+ccl_device_inline void kernel_volume_clean_stack(KernelGlobals *kg,
+                                                 ccl_addr_space VolumeStack *volume_stack)
+{
+	if(kernel_data.background.volume_shader != SHADER_NONE) {
+		/* Keep the world's volume (assumed to be entry 0) and end the stack right after it. */
+		volume_stack[1].shader = SHADER_NONE;
+	}
+	else {
+		volume_stack[0].shader = SHADER_NONE;  /* No world volume: leave the stack empty. */
+	}
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index 7d559b1aa31..0c2d9379b63 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -17,177 +17,66 @@
 #ifndef __KERNEL_WORK_STEALING_H__
 #define __KERNEL_WORK_STEALING_H__
 
+CCL_NAMESPACE_BEGIN
+
 /*
  * Utility functions for work stealing
  */
 
-#ifdef __WORK_STEALING__
-
 #ifdef __KERNEL_OPENCL__
 #  pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #endif
 
-uint get_group_id_with_ray_index(uint ray_index,
-                                 uint tile_dim_x,
-                                 uint tile_dim_y,
-                                 uint parallel_samples,
-                                 int dim)
+#ifdef __SPLIT_KERNEL__
+/* Fetch the next work index into *global_work_index; returns true if it is within total_work_size. */
+ccl_device bool get_next_work(KernelGlobals *kg,
+                              ccl_global uint *work_pools,
+                              uint total_work_size,
+                              uint ray_index,
+                              ccl_private uint *global_work_index)
 {
-	if(dim == 0) {
-		uint x_span = ray_index % (tile_dim_x * parallel_samples);
-		return x_span / get_local_size(0);
+	/* With a small amount of work there may be more threads than work due to
+	 * rounding up of the global size; stop such threads immediately. */
+	if(ray_index >= total_work_size) {
+		return false;
 	}
-	else /*if(dim == 1)*/ {
-		kernel_assert(dim == 1);
-		uint y_span = ray_index / (tile_dim_x * parallel_samples);
-		return y_span / get_local_size(1);
-	}
-}
-
-uint get_total_work(uint tile_dim_x,
-                    uint tile_dim_y,
-                    uint grp_idx,
-                    uint grp_idy,
-                    uint num_samples)
-{
-	uint threads_within_tile_border_x =
-		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
-		                                     : get_local_size(0);
-	uint threads_within_tile_border_y =
-		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
-		                                     : get_local_size(1);
-
-	threads_within_tile_border_x =
-		(threads_within_tile_border_x == 0) ? get_local_size(0)
-		                                    : threads_within_tile_border_x;
-	threads_within_tile_border_y =
-		(threads_within_tile_border_y == 0) ? get_local_size(1)
-		                                    : threads_within_tile_border_y;
-
-	return threads_within_tile_border_x *
-	       threads_within_tile_border_y *
-	       num_samples;
-}
 
-/* Returns 0 in case there is no next work available */
-/* Returns 1 in case work assigned is valid */
-int get_next_work(ccl_global uint *work_pool,
-                  ccl_private uint *my_work,
-                  uint tile_dim_x,
-                  uint tile_dim_y,
-                  uint num_samples,
-                  uint parallel_samples,
-                  uint ray_index)
-{
-	uint grp_idx = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           0);
-	uint grp_idy = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           1);
-	uint total_work = get_total_work(tile_dim_x,
-	                                 tile_dim_y,
-	                                 grp_idx,
-	                                 grp_idy,
-	                                 num_samples);
-	uint group_index = grp_idy * get_num_groups(0) + grp_idx;
-	*my_work = atomic_inc(&work_pool[group_index]);
-	return (*my_work < total_work) ? 1 : 0;
-}
+	/* Increase atomic work index counter in pool. */
+	uint pool = ray_index / WORK_POOL_SIZE;
+	uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]);
 
-/* This function assumes that the passed my_work is valid. */
-/* Decode sample number w.r.t. assigned my_work. */
-uint get_my_sample(uint my_work,
-                   uint tile_dim_x,
-                   uint tile_dim_y,
-                   uint parallel_samples,
-                   uint ray_index)
-{
-	uint grp_idx = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           0);
-	uint grp_idy = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           1);
-	uint threads_within_tile_border_x =
-		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
-		                                     : get_local_size(0);
-	uint threads_within_tile_border_y =
-		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
-		                                     : get_local_size(1);
+	/* Map per-pool work index to a global work index. */
+	uint global_size = ccl_global_size(0) * ccl_global_size(1);
+	kernel_assert(global_size % WORK_POOL_SIZE == 0);
+	kernel_assert(ray_index < global_size);
 
-	threads_within_tile_border_x =
-		(threads_within_tile_border_x == 0) ? get_local_size(0)
-		                                    : threads_within_tile_border_x;
-	threads_within_tile_border_y =
-		(threads_within_tile_border_y == 0) ? get_local_size(1)
-		                                    : threads_within_tile_border_y;
+	*global_work_index = (work_index / WORK_POOL_SIZE) * global_size
+	                   + (pool * WORK_POOL_SIZE)
+	                   + (work_index % WORK_POOL_SIZE);
 
-	return my_work /
-	       (threads_within_tile_border_x * threads_within_tile_border_y);
+	/* Test if all work for this pool is done. */
+	return (*global_work_index < total_work_size);
 }
+#endif
 
-/* Decode pixel and tile position w.r.t. assigned my_work. */
-void get_pixel_tile_position(ccl_private uint *pixel_x,
-                             ccl_private uint *pixel_y,
-                             ccl_private uint *tile_x,
-                             ccl_private uint *tile_y,
-                             uint my_work,
-                             uint tile_dim_x,
-                             uint tile_dim_y,
-                             uint tile_offset_x,
-                             uint tile_offset_y,
-                             uint parallel_samples,
-                             uint ray_index)
+/* Map a global work index to pixel X/Y and sample number within the given tile. */
+ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
+                                      uint global_work_index,
+                                      ccl_private uint *x,
+                                      ccl_private uint *y,
+                                      ccl_private uint *sample)
 {
-	uint grp_idx = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           0);
-	uint grp_idy = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           1);
-	uint threads_within_tile_border_x =
-		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
-		                                     : get_local_size(0);
-	uint threads_within_tile_border_y =
-		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
-		                                     : get_local_size(1);
-
-	threads_within_tile_border_x =
-		(threads_within_tile_border_x == 0) ? get_local_size(0)
-		                                    : threads_within_tile_border_x;
-	threads_within_tile_border_y =
-		(threads_within_tile_border_y == 0) ? get_local_size(1)
-		                                    : threads_within_tile_border_y;
-
-	uint total_associated_pixels =
-		threads_within_tile_border_x * threads_within_tile_border_y;
-	uint work_group_pixel_index = my_work % total_associated_pixels;
-	uint work_group_pixel_x =
-		work_group_pixel_index % threads_within_tile_border_x;
-	uint work_group_pixel_y =
-		work_group_pixel_index / threads_within_tile_border_x;
-
-	*pixel_x =
-		tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x;
-	*pixel_y =
-		tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y;
-	*tile_x = *pixel_x - tile_offset_x;
-	*tile_y = *pixel_y - tile_offset_y;
+	uint tile_pixels = tile->w * tile->h;
+	uint sample_offset = global_work_index / tile_pixels;
+	uint pixel_offset = global_work_index - sample_offset * tile_pixels;
+	uint y_offset = pixel_offset / tile->w;
+	uint x_offset = pixel_offset - y_offset * tile->w;
+
+	*x = tile->x + x_offset;
+	*y = tile->y + y_offset;
+	*sample = tile->start_sample + sample_offset;
 }
 
-#endif  /* __WORK_STEALING__ */
+CCL_NAMESPACE_END
 
 #endif  /* __KERNEL_WORK_STEALING_H__ */
diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp
new file mode 100644
index 00000000000..2ff1a392dc3
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU kernel entry points */
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
+ * one with SSE2 intrinsics.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+#  define __KERNEL_SSE2__
+#endif
+
+/* When building kernel for native machine detect kernel features from the flags
+ * set by compiler.
+ */
+#ifdef WITH_KERNEL_NATIVE
+#  ifdef __SSE2__
+#    ifndef __KERNEL_SSE2__
+#      define __KERNEL_SSE2__
+#    endif
+#  endif
+#  ifdef __SSE3__
+#    define __KERNEL_SSE3__
+#  endif
+#  ifdef __SSSE3__
+#    define __KERNEL_SSSE3__
+#  endif
+#  ifdef __SSE4_1__
+#    define __KERNEL_SSE41__
+#  endif
+#  ifdef __AVX__
+#    define __KERNEL_SSE__
+#    define __KERNEL_AVX__
+#  endif
+#  ifdef __AVX2__
+#    define __KERNEL_SSE__
+#    define __KERNEL_AVX2__
+#  endif
+#endif
+
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)
+    /* do nothing */
+#endif
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
new file mode 100644
index 00000000000..4a9e6047ecf
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while filter.cpp
+ * is compiled without them for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#    define __KERNEL_AVX__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
new file mode 100644
index 00000000000..c22ec576254
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU filter entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while filter.cpp
+ * is compiled without them for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+#  define KERNEL_STUB  /* AVX2 kernel disabled: filter_cpu_impl.h emits asserting stubs */
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#    define __KERNEL_AVX__
+#    define __KERNEL_AVX2__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
new file mode 100644
index 00000000000..bf13ba62806
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common declaration part of all CPU filter kernels; the
+ * definitions of these wrappers live in filter_cpu_impl.h. */
+void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
+                                                     TilesInfo *tiles,
+                                                     int x,
+                                                     int y,
+                                                     float *unfilteredA,
+                                                     float *unfilteredB,
+                                                     float *sampleV,  /* named sampleVariance in filter_cpu_impl.h */
+                                                     float *sampleVV,  /* named sampleVarianceV in filter_cpu_impl.h */
+                                                     float *bufferV,  /* named bufferVariance in filter_cpu_impl.h */
+                                                     int* prefilter_rect,
+                                                     int buffer_pass_stride,
+                                                     int buffer_denoising_offset);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
+                                                   TilesInfo *tiles,
+                                                   int m_offset,
+                                                   int v_offset,
+                                                   int x,
+                                                   int y,
+                                                   float *mean,
+                                                   float *variance,
+                                                   int* prefilter_rect,
+                                                   int buffer_pass_stride,
+                                                   int buffer_denoising_offset);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
+                                                       ccl_global float *image,
+                                                       ccl_global float *variance,
+                                                       ccl_global float *depth,
+                                                       ccl_global float *output,
+                                                       int *rect,
+                                                       int pass_stride);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
+                                                      float *mean,
+                                                      float *variance,
+                                                      float *a,
+                                                      float *b,
+                                                      int* prefilter_rect,
+                                                      int r);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+                                                           int x,
+                                                           int y,
+                                                           int storage_ofs,
+                                                           float *transform,
+                                                           int *rank,
+                                                           int* rect,  /* named prefilter_rect in filter_cpu_impl.h */
+                                                           int pass_stride,
+                                                           int radius,
+                                                           float pca_threshold);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
+                                                           int dy,
+                                                           float *weight_image,
+                                                           float *variance,
+                                                           float *difference_image,
+                                                           int* rect,
+                                                           int w,
+                                                           int channel_offset,
+                                                           float a,
+                                                           float k_2);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image,
+                                                float *out_image,
+                                                int* rect,
+                                                int w,
+                                                int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image,
+                                                       float *out_image,
+                                                       int* rect,
+                                                       int w,
+                                                       int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
+                                                         int dy,
+                                                         float *difference_image,
+                                                         float *image,
+                                                         float *out_image,
+                                                         float *accum_image,
+                                                         int* rect,
+                                                         int w,
+                                                         int f);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
+                                                             int dy,
+                                                             float *difference_image,
+                                                             float *buffer,
+                                                             float *transform,
+                                                             int *rank,
+                                                             float *XtWX,
+                                                             float3 *XtWY,
+                                                             int *rect,
+                                                             int *filter_rect,
+                                                             int w,
+                                                             int h,
+                                                             int f,
+                                                             int pass_stride);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
+                                                     float *accum_image,
+                                                     int* rect,
+                                                     int w);
+
+void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
+                                                int y,
+                                                int storage_ofs,
+                                                int w,
+                                                int h,
+                                                float *buffer,
+                                                int *rank,
+                                                float *XtWX,
+                                                float3 *XtWY,
+                                                int *buffer_params,
+                                                int sample);
+
+#undef KERNEL_ARCH  /* NOTE(review): presumably so the includer can redefine it per architecture - confirm */
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
new file mode 100644
index 00000000000..2fbb0ea2bdb
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -0,0 +1,268 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common implementation part of all CPU kernels.
+ *
+ * The idea is that each particular .cpp file sets the needed optimization flags
+ * and simply includes this file, without having to copy the implementation over.
+ */
+
+#include "kernel/kernel_compat_cpu.h"
+
+#include "kernel/filter/filter_kernel.h"
+#ifdef KERNEL_STUB  /* stub build: the wrappers below only assert when called */
+#  include "util/util_debug.h"
+#  define STUB_ASSERT_STR(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#  define STUB_ASSERT(arch, name) STUB_ASSERT_STR(arch, name)  /* extra level: KERNEL_ARCH is expanded before # stringifies it */
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+
+/* Denoise filter */
+
+void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
+                                                     TilesInfo *tiles,
+                                                     int x,
+                                                     int y,
+                                                     float *unfilteredA,
+                                                     float *unfilteredB,
+                                                     float *sampleVariance,
+                                                     float *sampleVarianceV,
+                                                     float *bufferVariance,
+                                                     int* prefilter_rect,
+                                                     int buffer_pass_stride,
+                                                     int buffer_denoising_offset)
+{
+#ifdef KERNEL_STUB  /* set by the including .cpp when this architecture is not built */
+	STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
+#else  /* forward to the shared kernel; prefilter_rect goes through load_int4() */
+	kernel_filter_divide_shadow(sample, tiles,
+	                            x, y,
+	                            unfilteredA,
+	                            unfilteredB,
+	                            sampleVariance,
+	                            sampleVarianceV,
+	                            bufferVariance,
+	                            load_int4(prefilter_rect),
+	                            buffer_pass_stride,
+	                            buffer_denoising_offset);
+#endif  /* KERNEL_STUB */
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
+                                                   TilesInfo *tiles,
+                                                   int m_offset,
+                                                   int v_offset,
+                                                   int x,
+                                                   int y,
+                                                   float *mean, float *variance,
+                                                   int* prefilter_rect,
+                                                   int buffer_pass_stride,
+                                                   int buffer_denoising_offset)
+{
+#ifdef KERNEL_STUB  /* set by the including .cpp when this architecture is not built */
+	STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
+#else  /* forward to the shared kernel; prefilter_rect goes through load_int4() */
+	kernel_filter_get_feature(sample, tiles,
+	                          m_offset, v_offset,
+	                          x, y,
+	                          mean, variance,
+	                          load_int4(prefilter_rect),
+	                          buffer_pass_stride,
+	                          buffer_denoising_offset);
+#endif  /* KERNEL_STUB */
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
+                                                       ccl_global float *image,
+                                                       ccl_global float *variance,
+                                                       ccl_global float *depth,
+                                                       ccl_global float *output,
+                                                       int *rect,
+                                                       int pass_stride)
+{
+#ifdef KERNEL_STUB  /* set by the including .cpp when this architecture is not built */
+	STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers);
+#else  /* forward to the shared kernel; rect goes through load_int4() */
+	kernel_filter_detect_outliers(x, y, image, variance, depth, output, load_int4(rect), pass_stride);
+#endif  /* KERNEL_STUB */
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
+                                                      float *mean,
+                                                      float *variance,
+                                                      float *a,
+                                                      float *b,
+                                                      int* prefilter_rect,
+                                                      int r)
+{
+#ifdef KERNEL_STUB  /* set by the including .cpp when this architecture is not built */
+	STUB_ASSERT(KERNEL_ARCH, filter_combine_halves);
+#else  /* forward to the shared kernel; prefilter_rect goes through load_int4() */
+	kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r);
+#endif  /* KERNEL_STUB */
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
+                                                           int x,
+                                                           int y,
+                                                           int storage_ofs,
+                                                           float *transform,
+                                                           int *rank,
+                                                           int* prefilter_rect,
+                                                           int pass_stride,
+                                                           int radius,
+                                                           float pca_threshold)
+{
+#ifdef KERNEL_STUB  /* set by the including .cpp when this architecture is not built */
+	STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
+#else  /* offset the outputs to slot storage_ofs, then forward to the shared kernel */
+	rank += storage_ofs;  /* one int per slot */
+	transform += storage_ofs*TRANSFORM_SIZE;  /* TRANSFORM_SIZE floats per slot */
+	kernel_filter_construct_transform(buffer,
+	                                  x, y,
+	                                  load_int4(prefilter_rect),
+	                                  pass_stride,
+	                                  transform,
+	                                  rank,
+	                                  radius,
+	                                  pca_threshold);
+#endif  /* KERNEL_STUB */
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
+                                                           int dy,
+                                                           float *weight_image,
+                                                           float *variance,
+                                                           float *difference_image,
+                                                           int *rect,
+                                                           int w,
+                                                           int channel_offset,
+                                                           float a,
+                                                           float k_2)
+{
+#ifdef KERNEL_STUB  /* set by the including .cpp when this architecture is not built */
+	STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
+#else  /* forward to the shared kernel; rect goes through load_int4() */
+	kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), w, channel_offset, a, k_2);
+#endif  /* KERNEL_STUB */
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image,
+                                                float *out_image,
+                                                int *rect,
+                                                int w,
+                                                int f)
+{
+#ifdef KERNEL_STUB  /* set by the including .cpp when this architecture is not built */
+	STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
+#else  /* forward to the shared kernel; rect goes through load_int4() */
+	kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), w, f);
+#endif  /* KERNEL_STUB */
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image,
+                                                       float *out_image,
+                                                       int *rect,
+                                                       int w,
+                                                       int f)
+{
+#ifdef KERNEL_STUB  /* set by the including .cpp when this architecture is not built */
+	STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
+#else  /* forward to the shared kernel; rect goes through load_int4() */
+	kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), w, f);
+#endif  /* KERNEL_STUB */
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
+                                                         int dy,
+                                                         float *difference_image,
+                                                         float *image,
+                                                         float *out_image,
+                                                         float *accum_image,
+                                                         int *rect,
+                                                         int w,
+                                                         int f)
+{
+#ifdef KERNEL_STUB  /* set by the including .cpp when this architecture is not built */
+	STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
+#else  /* forward to the shared kernel; rect goes through load_int4() */
+	kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), w, f);
+#endif  /* KERNEL_STUB */
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
+                                                             int dy,
+                                                             float *difference_image,
+                                                             float *buffer,
+                                                             float *transform,
+                                                             int *rank,
+                                                             float *XtWX,
+                                                             float3 *XtWY,
+                                                             int *rect,
+                                                             int *filter_rect,
+                                                             int w,
+                                                             int h,
+                                                             int f,
+                                                             int pass_stride)
+{
+#ifdef KERNEL_STUB  /* set by the including .cpp when this architecture is not built */
+	STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
+#else  /* forward to the shared kernel; rect and filter_rect both go through load_int4() */
+	kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
+#endif  /* KERNEL_STUB */
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
+                                                     float *accum_image,
+                                                     int *rect,
+                                                     int w)
+{
+#ifdef KERNEL_STUB  /* set by the including .cpp when this architecture is not built */
+	STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
+#else  /* forward to the shared kernel; rect goes through load_int4() */
+	kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), w);
+#endif  /* KERNEL_STUB */
+}
+
+void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
+                                                int y,
+                                                int storage_ofs,
+                                                int w,
+                                                int h,
+                                                float *buffer,
+                                                int *rank,
+                                                float *XtWX,
+                                                float3 *XtWY,
+                                                int *buffer_params,
+                                                int sample)
+{
+#ifdef KERNEL_STUB  /* set by the including .cpp when this architecture is not built */
+	STUB_ASSERT(KERNEL_ARCH, filter_finalize);
+#else  /* offset the inputs to slot storage_ofs, then forward to the shared kernel */
+	XtWX += storage_ofs*XTWX_SIZE;  /* XTWX_SIZE floats per slot */
+	XtWY += storage_ofs*XTWY_SIZE;  /* XTWY_SIZE float3 entries per slot */
+	rank += storage_ofs;  /* one int per slot */
+	kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);  /* NOTE(review): the literal 1 is presumably a storage stride - confirm */
+#endif  /* KERNEL_STUB */
+}
+
+#undef KERNEL_STUB  /* defined (or not) by the including .cpp */
+#undef STUB_ASSERT  /* defined in this header only when KERNEL_STUB is set */
+#undef KERNEL_ARCH  /* defined by the including .cpp */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
new file mode 100644
index 00000000000..f7c9935f1d0
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU filter entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while filter.cpp
+ * is compiled without them for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+#  define KERNEL_STUB  /* SSE2 kernel disabled: filter_cpu_impl.h emits asserting stubs */
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
new file mode 100644
index 00000000000..070b95a3505
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU filter entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while filter.cpp
+ * is compiled without them for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+#  define KERNEL_STUB  /* SSE3 kernel disabled: filter_cpu_impl.h emits asserting stubs */
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
new file mode 100644
index 00000000000..254025be4e2
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU filter entry points. This file is compiled with SSE4.1
+ * optimization flags and nearly all functions inlined, while filter.cpp
+ * is compiled without them for other CPUs. */
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+#  define KERNEL_STUB  /* SSE4.1 kernel disabled: filter_cpu_impl.h emits asserting stubs */
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/filter/filter.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp
index 72dbbd9a416..7679ab4f111 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp
@@ -56,9 +56,9 @@
     /* do nothing */
 #endif
 
-#include "kernel.h"
+#include "kernel/kernel.h"
 #define KERNEL_ARCH cpu
-#include "kernel_cpu_impl.h"
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -84,112 +84,16 @@ void kernel_tex_copy(KernelGlobals *kg,
 	if(0) {
 	}
 
-#define KERNEL_TEX(type, ttype, tname) \
+#define KERNEL_TEX(type, tname) \
 	else if(strcmp(name, #tname) == 0) { \
 		kg->tname.data = (type*)mem; \
 		kg->tname.width = width; \
 	}
-#define KERNEL_IMAGE_TEX(type, ttype, tname)
-#include "kernel_textures.h"
-
-	else if(strstr(name, "__tex_image_float4")) {
-		texture_image_float4 *tex = NULL;
-		int id = atoi(name + strlen("__tex_image_float4_"));
-		int array_index = id;
-
-		if(array_index >= 0 && array_index < TEX_NUM_FLOAT4_CPU) {
-			tex = &kg->texture_float4_images[array_index];
-		}
-
-		if(tex) {
-			tex->data = (float4*)mem;
-			tex->dimensions_set(width, height, depth);
-			tex->interpolation = interpolation;
-			tex->extension = extension;
-		}
-	}
-	else if(strstr(name, "__tex_image_float")) {
-		texture_image_float *tex = NULL;
-		int id = atoi(name + strlen("__tex_image_float_"));
-		int array_index = id - TEX_START_FLOAT_CPU;
-
-		if(array_index >= 0 && array_index < TEX_NUM_FLOAT_CPU) {
-			tex = &kg->texture_float_images[array_index];
-		}
-
-		if(tex) {
-			tex->data = (float*)mem;
-			tex->dimensions_set(width, height, depth);
-			tex->interpolation = interpolation;
-			tex->extension = extension;
-		}
-	}
-	else if(strstr(name, "__tex_image_byte4")) {
-		texture_image_uchar4 *tex = NULL;
-		int id = atoi(name + strlen("__tex_image_byte4_"));
-		int array_index = id - TEX_START_BYTE4_CPU;
-
-		if(array_index >= 0 && array_index < TEX_NUM_BYTE4_CPU) {
-			tex = &kg->texture_byte4_images[array_index];
-		}
-
-		if(tex) {
-			tex->data = (uchar4*)mem;
-			tex->dimensions_set(width, height, depth);
-			tex->interpolation = interpolation;
-			tex->extension = extension;
-		}
-	}
-	else if(strstr(name, "__tex_image_byte")) {
-		texture_image_uchar *tex = NULL;
-		int id = atoi(name + strlen("__tex_image_byte_"));
-		int array_index = id - TEX_START_BYTE_CPU;
-
-		if(array_index >= 0 && array_index < TEX_NUM_BYTE_CPU) {
-			tex = &kg->texture_byte_images[array_index];
-		}
-
-		if(tex) {
-			tex->data = (uchar*)mem;
-			tex->dimensions_set(width, height, depth);
-			tex->interpolation = interpolation;
-			tex->extension = extension;
-		}
-	}
-	else if(strstr(name, "__tex_image_half4")) {
-		texture_image_half4 *tex = NULL;
-		int id = atoi(name + strlen("__tex_image_half4_"));
-		int array_index = id - TEX_START_HALF4_CPU;
-
-		if(array_index >= 0 && array_index < TEX_NUM_HALF4_CPU) {
-			tex = &kg->texture_half4_images[array_index];
-		}
-
-		if(tex) {
-			tex->data = (half4*)mem;
-			tex->dimensions_set(width, height, depth);
-			tex->interpolation = interpolation;
-			tex->extension = extension;
-		}
-	}
-	else if(strstr(name, "__tex_image_half")) {
-		texture_image_half *tex = NULL;
-		int id = atoi(name + strlen("__tex_image_half_"));
-		int array_index = id - TEX_START_HALF_CPU;
-
-		if(array_index >= 0 && array_index < TEX_NUM_HALF_CPU) {
-			tex = &kg->texture_half_images[array_index];
-		}
-
-		if(tex) {
-			tex->data = (half*)mem;
-			tex->dimensions_set(width, height, depth);
-			tex->interpolation = interpolation;
-			tex->extension = extension;
-		}
-	}
-	else
+#define KERNEL_IMAGE_TEX(type, tname)
+#include "kernel/kernel_textures.h"
+	else {
 		assert(0);
+	}
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
index 1350d9e5c2e..a645fb4d8dd 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
@@ -17,21 +17,23 @@
 /* Optimized CPU kernel entry points. This file is compiled with AVX
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
- 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE__
-#  define __KERNEL_SSE2__
-#  define __KERNEL_SSE3__
-#  define __KERNEL_SSSE3__
-#  define __KERNEL_SSE41__
-#  define __KERNEL_AVX__
-#endif
 
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-#  include "kernel.h"
-#  define KERNEL_ARCH cpu_avx
-#  include "kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#    define __KERNEL_AVX__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
index 1a416e771ee..6bbb87727b9 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
@@ -18,21 +18,23 @@
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE__
-#  define __KERNEL_SSE2__
-#  define __KERNEL_SSE3__
-#  define __KERNEL_SSSE3__
-#  define __KERNEL_SSE41__
-#  define __KERNEL_AVX__
-#  define __KERNEL_AVX2__
-#endif
-
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-#  include "kernel.h"
-#  define KERNEL_ARCH cpu_avx2
-#  include "kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#    define __KERNEL_AVX__
+#    define __KERNEL_AVX2__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 1a07c705f1c..6bdb8546a24 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -18,7 +18,6 @@
 
 void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
                                            float *buffer,
-                                           unsigned int *rng_state,
                                            int sample,
                                            int x, int y,
                                            int offset,
@@ -42,11 +41,50 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
 void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
                                        uint4 *input,
                                        float4 *output,
-                                       float *output_luma,
                                        int type,
                                        int filter,
                                        int i,
                                        int offset,
                                        int sample);
 
+/* Split kernels */
+
+void KERNEL_FUNCTION_FULL_NAME(data_init)(
+        KernelGlobals *kg,
+        ccl_constant KernelData *data,
+        ccl_global void *split_data_buffer,
+        int num_elements,
+        ccl_global char *ray_state,
+        int start_sample,
+        int end_sample,
+        int sx, int sy, int sw, int sh, int offset, int stride,
+        ccl_global int *Queue_index,
+        int queuesize,
+        ccl_global char *use_queues_flag,
+        ccl_global unsigned int *work_pool_wgs,
+        unsigned int num_samples,
+        ccl_global float *buffer);
+
+#define DECLARE_SPLIT_KERNEL_FUNCTION(name) \
+	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData *data);
+
+DECLARE_SPLIT_KERNEL_FUNCTION(path_init)
+DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect)
+DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission)
+DECLARE_SPLIT_KERNEL_FUNCTION(do_volume)
+DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
+DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background)
+DECLARE_SPLIT_KERNEL_FUNCTION(shader_setup)
+DECLARE_SPLIT_KERNEL_FUNCTION(shader_sort)
+DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval)
+DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
+DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
+DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting)
+DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
+DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive)
+DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
+DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
+DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
+
 #undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
index af68907a5c2..37ba0f692be 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
@@ -17,62 +17,478 @@
 #ifndef __KERNEL_CPU_IMAGE_H__
 #define __KERNEL_CPU_IMAGE_H__
 
-#ifdef __KERNEL_CPU__
-
 CCL_NAMESPACE_BEGIN
 
-ccl_device float4 kernel_tex_image_interp_impl(KernelGlobals *kg, int tex, float x, float y)
-{
-	if(tex >= TEX_START_HALF_CPU)
-		return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp(x, y);
-	else if(tex >= TEX_START_BYTE_CPU)
-		return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp(x, y);
-	else if(tex >= TEX_START_FLOAT_CPU)
-		return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp(x, y);
-	else if(tex >= TEX_START_HALF4_CPU)
-		return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp(x, y);
-	else if(tex >= TEX_START_BYTE4_CPU)
-		return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp(x, y);
-	else
-		return kg->texture_float4_images[tex].interp(x, y);
-}
+template<typename T> struct TextureInterpolator  {
+#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
+	{ \
+		u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
+		u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
+		u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
+		u[3] = (1.0f / 6.0f) * t * t * t; \
+	} (void)0
+
+	static ccl_always_inline float4 read(float4 r)
+	{
+		return r;
+	}
+
+	static ccl_always_inline float4 read(uchar4 r)
+	{
+		float f = 1.0f/255.0f;
+		return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
+	}
+
+	static ccl_always_inline float4 read(uchar r)
+	{
+		float f = r*(1.0f/255.0f);
+		return make_float4(f, f, f, 1.0f);
+	}
+
+	static ccl_always_inline float4 read(float r)
+	{
+		/* TODO(dingto): Optimize this, so interpolation
+		 * happens on float instead of float4 */
+		return make_float4(r, r, r, 1.0f);
+	}
+
+	static ccl_always_inline float4 read(half4 r)
+	{
+		return half4_to_float4(r);
+	}
+
+	static ccl_always_inline float4 read(half r)
+	{
+		float f = half_to_float(r);
+		return make_float4(f, f, f, 1.0f);
+	}
+
+	static ccl_always_inline int wrap_periodic(int x, int width)
+	{
+		x %= width;
+		if(x < 0)
+			x += width;
+		return x;
+	}
+
+	static ccl_always_inline int wrap_clamp(int x, int width)
+	{
+		return clamp(x, 0, width-1);
+	}
+
+	static ccl_always_inline float frac(float x, int *ix)
+	{
+		int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
+		*ix = i;
+		return x - (float)i;
+	}
+
+	static ccl_always_inline float4 interp(const TextureInfo& info, float x, float y)
+	{
+		if(UNLIKELY(!info.data))
+			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+		const T *data = (const T*)info.data;
+		int width = info.width;
+		int height = info.height;
+		int ix, iy, nix, niy;
+
+		if(info.interpolation == INTERPOLATION_CLOSEST) {
+			frac(x*(float)width, &ix);
+			frac(y*(float)height, &iy);
+			switch(info.extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					ATTR_FALLTHROUGH;
+				case EXTENSION_EXTEND:
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					break;
+				default:
+					kernel_assert(0);
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			}
+			return read(data[ix + iy*width]);
+		}
+		else if(info.interpolation == INTERPOLATION_LINEAR) {
+			float tx = frac(x*(float)width - 0.5f, &ix);
+			float ty = frac(y*(float)height - 0.5f, &iy);
+
+			switch(info.extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					ATTR_FALLTHROUGH;
+				case EXTENSION_EXTEND:
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					break;
+				default:
+					kernel_assert(0);
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			}
+
+			float4 r = (1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width]);
+			r += (1.0f - ty)*tx*read(data[nix + iy*width]);
+			r += ty*(1.0f - tx)*read(data[ix + niy*width]);
+			r += ty*tx*read(data[nix + niy*width]);
+
+			return r;
+		}
+		else {
+			/* Bicubic b-spline interpolation. */
+			float tx = frac(x*(float)width - 0.5f, &ix);
+			float ty = frac(y*(float)height - 0.5f, &iy);
+			int pix, piy, nnix, nniy;
+			switch(info.extension) {
+				case EXTENSION_REPEAT:
+					ix = wrap_periodic(ix, width);
+					iy = wrap_periodic(iy, height);
+
+					pix = wrap_periodic(ix-1, width);
+					piy = wrap_periodic(iy-1, height);
+
+					nix = wrap_periodic(ix+1, width);
+					niy = wrap_periodic(iy+1, height);
+
+					nnix = wrap_periodic(ix+2, width);
+					nniy = wrap_periodic(iy+2, height);
+					break;
+				case EXTENSION_CLIP:
+					if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+						return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+					}
+					ATTR_FALLTHROUGH;
+				case EXTENSION_EXTEND:
+					pix = wrap_clamp(ix-1, width);
+					piy = wrap_clamp(iy-1, height);
+
+					nix = wrap_clamp(ix+1, width);
+					niy = wrap_clamp(iy+1, height);
+
+					nnix = wrap_clamp(ix+2, width);
+					nniy = wrap_clamp(iy+2, height);
+
+					ix = wrap_clamp(ix, width);
+					iy = wrap_clamp(iy, height);
+					break;
+				default:
+					kernel_assert(0);
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			}
+
+			const int xc[4] = {pix, ix, nix, nnix};
+			const int yc[4] = {width * piy,
+			                   width * iy,
+			                   width * niy,
+			                   width * nniy};
+			float u[4], v[4];
+			/* Helper macros to keep the code a reasonable size;
+			 * the compiler is left to inline all the matrix multiplications.
+			 */
+#define DATA(x, y) (read(data[xc[x] + yc[y]]))
+#define TERM(col) \
+			(v[col] * (u[0] * DATA(0, col) + \
+			           u[1] * DATA(1, col) + \
+			           u[2] * DATA(2, col) + \
+			           u[3] * DATA(3, col)))
+
+			SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+			SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+
+			/* Actual interpolation. */
+			return TERM(0) + TERM(1) + TERM(2) + TERM(3);
+
+#undef TERM
+#undef DATA
+		}
+	}
+
+	static ccl_always_inline float4 interp_3d_closest(const TextureInfo& info, float x, float y, float z)
+	{
+		int width = info.width;
+		int height = info.height;
+		int depth = info.depth;
+		int ix, iy, iz;
 
-ccl_device float4 kernel_tex_image_interp_3d_impl(KernelGlobals *kg, int tex, float x, float y, float z)
+		frac(x*(float)width, &ix);
+		frac(y*(float)height, &iy);
+		frac(z*(float)depth, &iz);
+
+		switch(info.extension) {
+			case EXTENSION_REPEAT:
+				ix = wrap_periodic(ix, width);
+				iy = wrap_periodic(iy, height);
+				iz = wrap_periodic(iz, depth);
+				break;
+			case EXTENSION_CLIP:
+				if(x < 0.0f || y < 0.0f || z < 0.0f ||
+				   x > 1.0f || y > 1.0f || z > 1.0f)
+				{
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				}
+				ATTR_FALLTHROUGH;
+			case EXTENSION_EXTEND:
+				ix = wrap_clamp(ix, width);
+				iy = wrap_clamp(iy, height);
+				iz = wrap_clamp(iz, depth);
+				break;
+			default:
+				kernel_assert(0);
+				return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		}
+
+		const T *data = (const T*)info.data;
+		return read(data[ix + iy*width + iz*width*height]);
+	}
+
+	static ccl_always_inline float4 interp_3d_linear(const TextureInfo& info, float x, float y, float z)
+	{
+		int width = info.width;
+		int height = info.height;
+		int depth = info.depth;
+		int ix, iy, iz;
+		int nix, niy, niz;
+
+		float tx = frac(x*(float)width - 0.5f, &ix);
+		float ty = frac(y*(float)height - 0.5f, &iy);
+		float tz = frac(z*(float)depth - 0.5f, &iz);
+
+		switch(info.extension) {
+			case EXTENSION_REPEAT:
+				ix = wrap_periodic(ix, width);
+				iy = wrap_periodic(iy, height);
+				iz = wrap_periodic(iz, depth);
+
+				nix = wrap_periodic(ix+1, width);
+				niy = wrap_periodic(iy+1, height);
+				niz = wrap_periodic(iz+1, depth);
+				break;
+			case EXTENSION_CLIP:
+				if(x < 0.0f || y < 0.0f || z < 0.0f ||
+				   x > 1.0f || y > 1.0f || z > 1.0f)
+				{
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				}
+				ATTR_FALLTHROUGH;
+			case EXTENSION_EXTEND:
+				nix = wrap_clamp(ix+1, width);
+				niy = wrap_clamp(iy+1, height);
+				niz = wrap_clamp(iz+1, depth);
+
+				ix = wrap_clamp(ix, width);
+				iy = wrap_clamp(iy, height);
+				iz = wrap_clamp(iz, depth);
+				break;
+			default:
+				kernel_assert(0);
+				return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		}
+
+		const T *data = (const T*)info.data;
+		float4 r;
+
+		r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + iz*width*height]);
+		r += (1.0f - tz)*(1.0f - ty)*tx*read(data[nix + iy*width + iz*width*height]);
+		r += (1.0f - tz)*ty*(1.0f - tx)*read(data[ix + niy*width + iz*width*height]);
+		r += (1.0f - tz)*ty*tx*read(data[nix + niy*width + iz*width*height]);
+
+		r += tz*(1.0f - ty)*(1.0f - tx)*read(data[ix + iy*width + niz*width*height]);
+		r += tz*(1.0f - ty)*tx*read(data[nix + iy*width + niz*width*height]);
+		r += tz*ty*(1.0f - tx)*read(data[ix + niy*width + niz*width*height]);
+		r += tz*ty*tx*read(data[nix + niy*width + niz*width*height]);
+
+		return r;
+	}
+
+	/* TODO(sergey): For some unspeakable reason both GCC-6 and Clang-3.9
+	 * cause a stack overflow in this function unless it is inlined.
+	 *
+	 * Only happens for the AVX2 kernel with global __KERNEL_SSE__
+	 * vectorization enabled.
+	 */
+#ifdef __GNUC__
+	static ccl_always_inline
+#else
+	static ccl_never_inline
+#endif
+	float4 interp_3d_tricubic(const TextureInfo& info, float x, float y, float z)
+	{
+		int width = info.width;
+		int height = info.height;
+		int depth = info.depth;
+		int ix, iy, iz;
+		int nix, niy, niz;
+		/* Tricubic b-spline interpolation. */
+		const float tx = frac(x*(float)width - 0.5f, &ix);
+		const float ty = frac(y*(float)height - 0.5f, &iy);
+		const float tz = frac(z*(float)depth - 0.5f, &iz);
+		int pix, piy, piz, nnix, nniy, nniz;
+
+		switch(info.extension) {
+			case EXTENSION_REPEAT:
+				ix = wrap_periodic(ix, width);
+				iy = wrap_periodic(iy, height);
+				iz = wrap_periodic(iz, depth);
+
+				pix = wrap_periodic(ix-1, width);
+				piy = wrap_periodic(iy-1, height);
+				piz = wrap_periodic(iz-1, depth);
+
+				nix = wrap_periodic(ix+1, width);
+				niy = wrap_periodic(iy+1, height);
+				niz = wrap_periodic(iz+1, depth);
+
+				nnix = wrap_periodic(ix+2, width);
+				nniy = wrap_periodic(iy+2, height);
+				nniz = wrap_periodic(iz+2, depth);
+				break;
+			case EXTENSION_CLIP:
+				if(x < 0.0f || y < 0.0f || z < 0.0f ||
+				   x > 1.0f || y > 1.0f || z > 1.0f)
+				{
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				}
+				ATTR_FALLTHROUGH;
+			case EXTENSION_EXTEND:
+				pix = wrap_clamp(ix-1, width);
+				piy = wrap_clamp(iy-1, height);
+				piz = wrap_clamp(iz-1, depth);
+
+				nix = wrap_clamp(ix+1, width);
+				niy = wrap_clamp(iy+1, height);
+				niz = wrap_clamp(iz+1, depth);
+
+				nnix = wrap_clamp(ix+2, width);
+				nniy = wrap_clamp(iy+2, height);
+				nniz = wrap_clamp(iz+2, depth);
+
+				ix = wrap_clamp(ix, width);
+				iy = wrap_clamp(iy, height);
+				iz = wrap_clamp(iz, depth);
+				break;
+			default:
+				kernel_assert(0);
+				return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		}
+
+		const int xc[4] = {pix, ix, nix, nnix};
+		const int yc[4] = {width * piy,
+		                   width * iy,
+		                   width * niy,
+		                   width * nniy};
+		const int zc[4] = {width * height * piz,
+		                   width * height * iz,
+		                   width * height * niz,
+		                   width * height * nniz};
+		float u[4], v[4], w[4];
+
+		/* Helper macros to keep the code a reasonable size;
+		 * the compiler is left to inline all the matrix multiplications.
+		 */
+#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
+#define COL_TERM(col, row) \
+		(v[col] * (u[0] * DATA(0, col, row) + \
+		           u[1] * DATA(1, col, row) + \
+		           u[2] * DATA(2, col, row) + \
+		           u[3] * DATA(3, col, row)))
+#define ROW_TERM(row) \
+		(w[row] * (COL_TERM(0, row) + \
+		           COL_TERM(1, row) + \
+		           COL_TERM(2, row) + \
+		           COL_TERM(3, row)))
+
+		SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+		SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+		SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+		/* Actual interpolation. */
+		const T *data = (const T*)info.data;
+		return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+#undef COL_TERM
+#undef ROW_TERM
+#undef DATA
+	}
+
+	static ccl_always_inline float4 interp_3d(const TextureInfo& info,
+	                                          float x, float y, float z,
+	                                          InterpolationType interp)
+	{
+		if(UNLIKELY(!info.data))
+			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+		switch((interp == INTERPOLATION_NONE)? info.interpolation: interp) {
+			case INTERPOLATION_CLOSEST:
+				return interp_3d_closest(info, x, y, z);
+			case INTERPOLATION_LINEAR:
+				return interp_3d_linear(info, x, y, z);
+			default:
+				return interp_3d_tricubic(info, x, y, z);
+		}
+	}
+#undef SET_CUBIC_SPLINE_WEIGHTS
+};
+
+ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
 {
-	if(tex >= TEX_START_HALF_CPU)
-		return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp_3d(x, y, z);
-	else if(tex >= TEX_START_BYTE_CPU)
-		return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp_3d(x, y, z);
-	else if(tex >= TEX_START_FLOAT_CPU)
-		return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp_3d(x, y, z);
-	else if(tex >= TEX_START_HALF4_CPU)
-		return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp_3d(x, y, z);
-	else if(tex >= TEX_START_BYTE4_CPU)
-		return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp_3d(x, y, z);
-	else
-		return kg->texture_float4_images[tex].interp_3d(x, y, z);
+	const TextureInfo& info = kernel_tex_fetch(__texture_info, id);
 
+	switch(kernel_tex_type(id)) {
+		case IMAGE_DATA_TYPE_HALF:
+			return TextureInterpolator<half>::interp(info, x, y);
+		case IMAGE_DATA_TYPE_BYTE:
+			return TextureInterpolator<uchar>::interp(info, x, y);
+		case IMAGE_DATA_TYPE_FLOAT:
+			return TextureInterpolator<float>::interp(info, x, y);
+		case IMAGE_DATA_TYPE_HALF4:
+			return TextureInterpolator<half4>::interp(info, x, y);
+		case IMAGE_DATA_TYPE_BYTE4:
+			return TextureInterpolator<uchar4>::interp(info, x, y);
+		case IMAGE_DATA_TYPE_FLOAT4:
+		default:
+			return TextureInterpolator<float4>::interp(info, x, y);
+	}
 }
 
-ccl_device float4 kernel_tex_image_interp_3d_ex_impl(KernelGlobals *kg, int tex, float x, float y, float z, int interpolation)
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, InterpolationType interp)
 {
-	if(tex >= TEX_START_HALF_CPU)
-		return kg->texture_half_images[tex - TEX_START_HALF_CPU].interp_3d_ex(x, y, z, interpolation);
-	else if(tex >= TEX_START_BYTE_CPU)
-		return kg->texture_byte_images[tex - TEX_START_BYTE_CPU].interp_3d_ex(x, y, z, interpolation);
-	else if(tex >= TEX_START_FLOAT_CPU)
-		return kg->texture_float_images[tex - TEX_START_FLOAT_CPU].interp_3d_ex(x, y, z, interpolation);
-	else if(tex >= TEX_START_HALF4_CPU)
-		return kg->texture_half4_images[tex - TEX_START_HALF4_CPU].interp_3d_ex(x, y, z, interpolation);
-	else if(tex >= TEX_START_BYTE4_CPU)
-		return kg->texture_byte4_images[tex - TEX_START_BYTE4_CPU].interp_3d_ex(x, y, z, interpolation);
-	else
-		return kg->texture_float4_images[tex].interp_3d_ex(x, y, z, interpolation);
+	const TextureInfo& info = kernel_tex_fetch(__texture_info, id);
+
+	switch(kernel_tex_type(id)) {
+		case IMAGE_DATA_TYPE_HALF:
+			return TextureInterpolator<half>::interp_3d(info, x, y, z, interp);
+		case IMAGE_DATA_TYPE_BYTE:
+			return TextureInterpolator<uchar>::interp_3d(info, x, y, z, interp);
+		case IMAGE_DATA_TYPE_FLOAT:
+			return TextureInterpolator<float>::interp_3d(info, x, y, z, interp);
+		case IMAGE_DATA_TYPE_HALF4:
+			return TextureInterpolator<half4>::interp_3d(info, x, y, z, interp);
+		case IMAGE_DATA_TYPE_BYTE4:
+			return TextureInterpolator<uchar4>::interp_3d(info, x, y, z, interp);
+		case IMAGE_DATA_TYPE_FLOAT4:
+		default:
+			return TextureInterpolator<float4>::interp_3d(info, x, y, z, interp);
+	}
 }
 
 CCL_NAMESPACE_END
 
-#endif  // __KERNEL_CPU__
-
-
 #endif // __KERNEL_CPU_IMAGE_H__
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index ec82d4b4c22..fdeb7dcd3e4 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -20,43 +20,84 @@
  * simply includes this file without worry of copying actual implementation over.
  */
 
-#include "kernel_compat_cpu.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_cpu_image.h"
-#include "kernel_film.h"
-#include "kernel_path.h"
-#include "kernel_path_branched.h"
-#include "kernel_bake.h"
+#include "kernel/kernel_compat_cpu.h"
+
+#ifndef KERNEL_STUB
+#  ifndef __SPLIT_KERNEL__
+#    include "kernel/kernel_math.h"
+#    include "kernel/kernel_types.h"
+
+#    include "kernel/split/kernel_split_data.h"
+#    include "kernel/kernel_globals.h"
+
+#    include "kernel/kernels/cpu/kernel_cpu_image.h"
+#    include "kernel/kernel_film.h"
+#    include "kernel/kernel_path.h"
+#    include "kernel/kernel_path_branched.h"
+#    include "kernel/kernel_bake.h"
+#  else
+#    include "kernel/split/kernel_split_common.h"
+
+#    include "kernel/split/kernel_data_init.h"
+#    include "kernel/split/kernel_path_init.h"
+#    include "kernel/split/kernel_scene_intersect.h"
+#    include "kernel/split/kernel_lamp_emission.h"
+#    include "kernel/split/kernel_do_volume.h"
+#    include "kernel/split/kernel_queue_enqueue.h"
+#    include "kernel/split/kernel_indirect_background.h"
+#    include "kernel/split/kernel_shader_setup.h"
+#    include "kernel/split/kernel_shader_sort.h"
+#    include "kernel/split/kernel_shader_eval.h"
+#    include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#    include "kernel/split/kernel_subsurface_scatter.h"
+#    include "kernel/split/kernel_direct_lighting.h"
+#    include "kernel/split/kernel_shadow_blocked_ao.h"
+#    include "kernel/split/kernel_shadow_blocked_dl.h"
+#    include "kernel/split/kernel_enqueue_inactive.h"
+#    include "kernel/split/kernel_next_iteration_setup.h"
+#    include "kernel/split/kernel_indirect_subsurface.h"
+#    include "kernel/split/kernel_buffer_update.h"
+#  endif  /* __SPLIT_KERNEL__ */
+#else
+#  include "util/util_debug.h"
+#  define STUB_ASSERT_IMPL(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#  define STUB_ASSERT(arch, name) STUB_ASSERT_IMPL(arch, name)  /* Extra level: expands KERNEL_ARCH before '#' stringifies it. */
+#  ifdef __SPLIT_KERNEL__
+#    include "kernel/split/kernel_data_init.h"
+#  endif  /* __SPLIT_KERNEL__ */
+#endif  /* KERNEL_STUB */
 
 CCL_NAMESPACE_BEGIN
 
+#ifndef __SPLIT_KERNEL__
+
 /* Path Tracing */
 
 void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
                                            float *buffer,
-                                           unsigned int *rng_state,
                                            int sample,
                                            int x, int y,
                                            int offset,
                                            int stride)
 {
-#ifdef __BRANCHED_PATH__
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, path_trace);
+#else
+#  ifdef __BRANCHED_PATH__
 	if(kernel_data.integrator.branched) {
 		kernel_branched_path_trace(kg,
 		                           buffer,
-		                           rng_state,
 		                           sample,
 		                           x, y,
 		                           offset,
 		                           stride);
 	}
 	else
-#endif
+#  endif
 	{
-		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+		kernel_path_trace(kg, buffer, sample, x, y, offset, stride);
 	}
+#endif /* KERNEL_STUB */
 }
 
 /* Film */
@@ -69,6 +110,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
                                                 int offset,
                                                 int stride)
 {
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, convert_to_byte);
+#else
 	kernel_film_convert_to_byte(kg,
 	                            rgba,
 	                            buffer,
@@ -76,6 +120,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
 	                            x, y,
 	                            offset,
 	                            stride);
+#endif /* KERNEL_STUB */
 }
 
 void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
@@ -86,6 +131,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
                                                       int offset,
                                                       int stride)
 {
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, convert_to_half_float);
+#else
 	kernel_film_convert_to_half_float(kg,
 	                                  rgba,
 	                                  buffer,
@@ -93,6 +141,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
 	                                  x, y,
 	                                  offset,
 	                                  stride);
+#endif /* KERNEL_STUB */
 }
 
 /* Shader Evaluate */
@@ -100,16 +149,17 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
 void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
                                        uint4 *input,
                                        float4 *output,
-                                       float *output_luma,
                                        int type,
                                        int filter,
                                        int i,
                                        int offset,
                                        int sample)
 {
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, shader);
+#else
 	if(type >= SHADER_EVAL_BAKE) {
-		kernel_assert(output_luma == NULL);
-#ifdef __BAKING__
+#  ifdef __BAKING__
 		kernel_bake_evaluate(kg,
 		                     input,
 		                     output,
@@ -118,17 +168,70 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
 		                     i,
 		                     offset,
 		                     sample);
-#endif
+#  endif
+	}
+	else if(type == SHADER_EVAL_DISPLACE) {
+		kernel_displace_evaluate(kg, input, output, i);
 	}
 	else {
-		kernel_shader_evaluate(kg,
-		                       input,
-		                       output,
-		                       output_luma,
-		                       (ShaderEvalType)type,
-		                       i,
-		                       sample);
+		kernel_background_evaluate(kg, input, output, i);
 	}
+#endif /* KERNEL_STUB */
 }
 
+#else  /* __SPLIT_KERNEL__ */
+
+/* Split Kernel Path Tracing */
+
+#ifdef KERNEL_STUB
+#  define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+	{ \
+		STUB_ASSERT(KERNEL_ARCH, name); \
+	}
+
+#  define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
+	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+	{ \
+		STUB_ASSERT(KERNEL_ARCH, name); \
+	}
+#else
+#  define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+	{ \
+		kernel_##name(kg); \
+	}
+
+#  define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
+	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+	{ \
+		ccl_local type locals; \
+		kernel_##name(kg, &locals); \
+	}
+#endif /* KERNEL_STUB */
+
+DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
+DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
+DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
+DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
+#endif  /* __SPLIT_KERNEL__ */
+
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp
new file mode 100644
index 00000000000..ca750e5a00d
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU kernel entry points */
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
+ * one with SSE2 intrinsics.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+#  define __KERNEL_SSE2__
+#endif
+
+#define __SPLIT_KERNEL__
+
+/* When building the kernel for the native machine, detect kernel features from
+ * the flags set by the compiler.
+ */
+#ifdef WITH_KERNEL_NATIVE
+#  ifdef __SSE2__
+#    ifndef __KERNEL_SSE2__
+#      define __KERNEL_SSE2__
+#    endif
+#  endif
+#  ifdef __SSE3__
+#    define __KERNEL_SSE3__
+#  endif
+#  ifdef __SSSE3__
+#    define __KERNEL_SSSE3__
+#  endif
+#  ifdef __SSE4_1__
+#    define __KERNEL_SSE41__
+#  endif
+#  ifdef __AVX__
+#    define __KERNEL_AVX__
+#  endif
+#  ifdef __AVX2__
+#    define __KERNEL_SSE__
+#    define __KERNEL_AVX2__
+#  endif
+#endif
+
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)
+    /* do nothing */
+#endif
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
new file mode 100644
index 00000000000..6ba3425a343
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+#define __SPLIT_KERNEL__
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#    define __KERNEL_AVX__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
new file mode 100644
index 00000000000..76b2d77ebb8
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+#define __SPLIT_KERNEL__
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#    define __KERNEL_AVX__
+#    define __KERNEL_AVX2__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_avx2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
new file mode 100644
index 00000000000..b468b6f44c8
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+#define __SPLIT_KERNEL__
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
new file mode 100644
index 00000000000..3e5792d0b17
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+#define __SPLIT_KERNEL__
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
new file mode 100644
index 00000000000..3629f21cd29
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+#define __SPLIT_KERNEL__
+
+#include "util/util_optimization.h"
+
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#  endif
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
index a5f2d6e7294..57530c88710 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
@@ -18,15 +18,17 @@
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE2__
-#endif
-
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-#  include "kernel.h"
-#  define KERNEL_ARCH cpu_sse2
-#  include "kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse2
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
index 86f9ce991f8..c607753bc4b 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
@@ -18,17 +18,19 @@
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE2__
-#  define __KERNEL_SSE3__
-#  define __KERNEL_SSSE3__
-#endif
-
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-#  include "kernel.h"
-#  define KERNEL_ARCH cpu_sse3
-#  include "kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse3
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
index c174406047d..a278554731c 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
@@ -18,18 +18,20 @@
  * optimization flags and nearly all functions inlined, while kernel.cpp
  * is compiled without for other CPU's. */
 
-/* SSE optimization disabled for now on 32 bit, see bug #36316 */
-#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-#  define __KERNEL_SSE2__
-#  define __KERNEL_SSE3__
-#  define __KERNEL_SSSE3__
-#  define __KERNEL_SSE41__
-#endif
-
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-#  include "kernel.h"
-#  define KERNEL_ARCH cpu_sse41
-#  include "kernel_cpu_impl.h"
+#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+#  define KERNEL_STUB
+#else
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE2__
+#    define __KERNEL_SSE3__
+#    define __KERNEL_SSSE3__
+#    define __KERNEL_SSE41__
+#  endif
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu_sse41
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
new file mode 100644
index 00000000000..c8172355a7f
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -0,0 +1,251 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA kernel entry points */
+
+#ifdef __CUDA_ARCH__
+
+#include "kernel_config.h"
+
+#include "kernel/kernel_compat_cuda.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+/* kernels */
+
+extern "C" __global__ void  /* CUDA entry point wrapping kernel_filter_divide_shadow() */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_divide_shadow(int sample,
+                                 TilesInfo *tiles,
+                                 float *unfilteredA,
+                                 float *unfilteredB,
+                                 float *sampleVariance,
+                                 float *sampleVarianceV,
+                                 float *bufferVariance,
+                                 int4 prefilter_rect,
+                                 int buffer_pass_stride,
+                                 int buffer_denoising_offset)
+{
+	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;  /* this thread's x, starting at prefilter_rect.x */
+	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;  /* this thread's y, starting at prefilter_rect.y */
+	if(x < prefilter_rect.z && y < prefilter_rect.w) {  /* .z/.w act as exclusive upper bounds; other threads do nothing */
+		kernel_filter_divide_shadow(sample,
+		                            tiles,
+		                            x, y,
+		                            unfilteredA,
+		                            unfilteredB,
+		                            sampleVariance,
+		                            sampleVarianceV,
+		                            bufferVariance,
+		                            prefilter_rect,
+		                            buffer_pass_stride,
+		                            buffer_denoising_offset);
+	}
+}
+
+extern "C" __global__ void  /* CUDA entry point wrapping kernel_filter_get_feature() */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_get_feature(int sample,
+                               TilesInfo *tiles,
+                               int m_offset,
+                               int v_offset,
+                               float *mean,
+                               float *variance,
+                               int4 prefilter_rect,
+                               int buffer_pass_stride,
+                               int buffer_denoising_offset)
+{
+	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;  /* this thread's x, starting at prefilter_rect.x */
+	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;  /* this thread's y, starting at prefilter_rect.y */
+	if(x < prefilter_rect.z && y < prefilter_rect.w) {  /* .z/.w act as exclusive upper bounds; other threads do nothing */
+		kernel_filter_get_feature(sample,
+		                          tiles,
+		                          m_offset, v_offset,
+		                          x, y,
+		                          mean, variance,
+		                          prefilter_rect,
+		                          buffer_pass_stride,
+		                          buffer_denoising_offset);
+	}
+}
+
+extern "C" __global__ void  /* CUDA entry point wrapping kernel_filter_detect_outliers() */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_detect_outliers(float *image,
+                                   float *variance,
+                                   float *depth,
+                                   float *output,
+                                   int4 prefilter_rect,
+                                   int pass_stride)
+{
+	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;  /* this thread's x, starting at prefilter_rect.x */
+	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;  /* this thread's y, starting at prefilter_rect.y */
+	if(x < prefilter_rect.z && y < prefilter_rect.w) {  /* .z/.w act as exclusive upper bounds; other threads do nothing */
+		kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
+	}
+}
+
+extern "C" __global__ void  /* CUDA entry point wrapping kernel_filter_combine_halves() */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
+{
+	int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;  /* this thread's x, starting at prefilter_rect.x */
+	int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;  /* this thread's y, starting at prefilter_rect.y */
+	if(x < prefilter_rect.z && y < prefilter_rect.w) {  /* .z/.w act as exclusive upper bounds; other threads do nothing */
+		kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
+	}
+}
+
+extern "C" __global__ void  /* CUDA entry point wrapping kernel_filter_construct_transform() */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
+                                       float *transform, int *rank,
+                                       int4 filter_area, int4 rect,
+                                       int radius, float pca_threshold,
+                                       int pass_stride)
+{
+	int x = blockDim.x*blockIdx.x + threadIdx.x;  /* 0-based here; filter_area.x is added only at the call below */
+	int y = blockDim.y*blockIdx.y + threadIdx.y;  /* 0-based here; filter_area.y is added only at the call below */
+	if(x < filter_area.z && y < filter_area.w) {  /* .z/.w bound the 0-based x/y; other threads do nothing */
+		int *l_rank = rank + y*filter_area.z + x;  /* per-position slot, using filter_area.z as the row length */
+		float *l_transform = transform + y*filter_area.z + x;  /* same per-position offset into transform */
+		kernel_filter_construct_transform(buffer,
+		                                  x + filter_area.x, y + filter_area.y,
+		                                  rect, pass_stride,
+		                                  l_transform, l_rank,
+		                                  radius, pca_threshold,
+		                                  filter_area.z*filter_area.w,  /* NOTE(review): presumably the stride between transform elements - confirm in callee */
+		                                  threadIdx.y*blockDim.x + threadIdx.x);  /* linear index of this thread within its block */
+	}
+}
+
+extern "C" __global__ void  /* CUDA entry point wrapping kernel_filter_nlm_calc_difference() */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
+                                       const float *ccl_restrict weight_image,
+                                       const float *ccl_restrict variance_image,
+                                       float *difference_image,
+                                       int4 rect, int w,
+                                       int channel_offset,
+                                       float a, float k_2)
+{
+	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;  /* this thread's x, starting at rect.x */
+	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;  /* this thread's y, starting at rect.y */
+	if(x < rect.z && y < rect.w) {  /* rect.z/.w act as exclusive upper bounds; other threads do nothing */
+		kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2);
+	}
+}
+
+extern "C" __global__ void  /* CUDA entry point wrapping kernel_filter_nlm_blur() */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f)
+{
+	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;  /* this thread's x, starting at rect.x */
+	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;  /* this thread's y, starting at rect.y */
+	if(x < rect.z && y < rect.w) {  /* rect.z/.w act as exclusive upper bounds; other threads do nothing */
+		kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f);
+	}
+}
+
+extern "C" __global__ void  /* CUDA entry point wrapping kernel_filter_nlm_calc_weight() */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f)
+{
+	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;  /* this thread's x, starting at rect.x */
+	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;  /* this thread's y, starting at rect.y */
+	if(x < rect.z && y < rect.w) {  /* rect.z/.w act as exclusive upper bounds; other threads do nothing */
+		kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f);
+	}
+}
+
+extern "C" __global__ void  /* CUDA entry point wrapping kernel_filter_nlm_update_output() */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_update_output(int dx, int dy,
+                                     const float *ccl_restrict difference_image,
+                                     const float *ccl_restrict image,
+                                     float *out_image, float *accum_image,
+                                     int4 rect, int w,
+                                     int f)
+{
+	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;  /* this thread's x, starting at rect.x */
+	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;  /* this thread's y, starting at rect.y */
+	if(x < rect.z && y < rect.w) {  /* rect.z/.w act as exclusive upper bounds; other threads do nothing */
+		kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f);
+	}
+}
+
+extern "C" __global__ void  /* CUDA entry point wrapping kernel_filter_nlm_normalize() */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_normalize(float *out_image, const float *ccl_restrict accum_image, int4 rect, int w)
+{
+	int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;  /* this thread's x, starting at rect.x */
+	int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;  /* this thread's y, starting at rect.y */
+	if(x < rect.z && y < rect.w) {  /* rect.z/.w act as exclusive upper bounds; other threads do nothing */
+		kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w);
+	}
+}
+
+extern "C" __global__ void  /* CUDA entry point wrapping kernel_filter_nlm_construct_gramian() */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
+                                         const float *ccl_restrict difference_image,
+                                         const float *ccl_restrict buffer,
+                                         float const* __restrict__ transform,
+                                         int *rank,
+                                         float *XtWX,
+                                         float3 *XtWY,
+                                         int4 rect,
+                                         int4 filter_rect,
+                                         int w, int h, int f,
+                                         int pass_stride)
+{
+	int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);  /* start offset is never negative */
+	int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);  /* start offset is never negative */
+	if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {  /* NOTE(review): reads as clamping to the overlap of rect and filter_rect, in filter_rect-relative coordinates - confirm */
+		kernel_filter_nlm_construct_gramian(x, y,
+		                                    dx, dy,
+		                                    difference_image,
+		                                    buffer,
+		                                    transform, rank,
+		                                    XtWX, XtWY,
+		                                    rect, filter_rect,
+		                                    w, h, f,
+		                                    pass_stride,
+		                                    threadIdx.y*blockDim.x + threadIdx.x);  /* linear index of this thread within its block */
+	}
+}
+
+extern "C" __global__ void  /* CUDA entry point wrapping kernel_filter_finalize() */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_filter_finalize(int w, int h,
+                            float *buffer, int *rank,
+                            float *XtWX, float3 *XtWY,
+                            int4 filter_area, int4 buffer_params,
+                            int sample)
+{
+	int x = blockDim.x*blockIdx.x + threadIdx.x;  /* 0-based; no filter_area.x offset is applied in this wrapper */
+	int y = blockDim.y*blockIdx.y + threadIdx.y;  /* 0-based; no filter_area.y offset is applied in this wrapper */
+	if(x < filter_area.z && y < filter_area.w) {  /* .z/.w bound the 0-based x/y; other threads do nothing */
+		int storage_ofs = y*filter_area.z+x;  /* per-position offset, using filter_area.z as the row length */
+		rank += storage_ofs;  /* the three pointers below are advanced by the same offset */
+		XtWX += storage_ofs;
+		XtWY += storage_ofs;
+		kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
+	}
+}
+
+#endif
+
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index eb2b6ea5414..3c93e00ccf1 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -16,134 +16,53 @@
 
 /* CUDA kernel entry points */
 
-#include "../../kernel_compat_cuda.h"
-#include "../../kernel_math.h"
-#include "../../kernel_types.h"
-#include "../../kernel_globals.h"
-#include "../../kernel_film.h"
-#include "../../kernel_path.h"
-#include "../../kernel_path_branched.h"
-#include "../../kernel_bake.h"
-
-/* device data taken from CUDA occupancy calculator */
-
 #ifdef __CUDA_ARCH__
 
-/* 2.0 and 2.1 */
-#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 32
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
-
-/* 3.0 and 3.5 */
-#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 63
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.2 */
-#elif __CUDA_ARCH__ == 320
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 63
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.7 */
-#elif __CUDA_ARCH__ == 370
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 63
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 5.0, 5.2, 5.3, 6.0, 6.1 */
-#elif __CUDA_ARCH__ >= 500
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 48
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* unknown architecture */
-#else
-#  error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
-#endif
+#include "kernel/kernel_compat_cuda.h"
+#include "kernel_config.h"
 
-/* compute number of threads per block and minimum blocks per multiprocessor
- * given the maximum number of registers per thread */
+#include "util/util_atomic.h"
 
-#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
-	__launch_bounds__( \
-		threads_block_width*threads_block_width, \
-		CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
-		)
-
-/* sanity checks */
-
-#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
-#  error "Maximum number of threads per block exceeded"
-#endif
-
-#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
-#  error "Maximum number of blocks per multiprocessor exceeded"
-#endif
-
-#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-#  error "Maximum number of registers per thread exceeded"
-#endif
-
-#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-#  error "Maximum number of registers per thread exceeded"
-#endif
+#include "kernel/kernel_math.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernels/cuda/kernel_cuda_image.h"
+#include "kernel/kernel_film.h"
+#include "kernel/kernel_path.h"
+#include "kernel/kernel_path_branched.h"
+#include "kernel/kernel_bake.h"
+#include "kernel/kernel_work_stealing.h"
 
 /* kernels */
-
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+kernel_cuda_path_trace(WorkTile *tile, uint total_work_size)
 {
-	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+	int work_index = ccl_global_id(0);  /* NOTE(review): signed int, but compared against uint total_work_size below */
 
-	if(x < sx + sw && y < sy + sh)
-		kernel_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
+	if(work_index < total_work_size) {  /* threads whose index is past the work size do nothing */
+		uint x, y, sample;
+		get_work_pixel(tile, work_index, &x, &y, &sample);  /* expected to fill x, y and sample, which are read below */
+
+		KernelGlobals kg;  /* local, not explicitly initialized here; passed by address below */
+		kernel_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
+	}
 }
 
 #ifdef __BRANCHED_PATH__
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
-kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
 {
-	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+	int work_index = ccl_global_id(0);  /* NOTE(review): signed int, but compared against uint total_work_size below */
+
+	if(work_index < total_work_size) {  /* threads whose index is past the work size do nothing */
+		uint x, y, sample;
+		get_work_pixel(tile, work_index, &x, &y, &sample);  /* expected to fill x, y and sample, which are read below */
 
-	if(x < sx + sw && y < sy + sh)
-		kernel_branched_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
+		KernelGlobals kg;  /* local, not explicitly initialized here; passed by address below */
+		kernel_branched_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
+	}
 }
 #endif
 
@@ -154,8 +73,9 @@ kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
 
-	if(x < sx + sw && y < sy + sh)
+	if(x < sx + sw && y < sy + sh) {
 		kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+	}
 }
 
 extern "C" __global__ void
@@ -165,31 +85,44 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
 
-	if(x < sx + sw && y < sy + sh)
+	if(x < sx + sw && y < sy + sh) {
 		kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+	}
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_shader(uint4 *input,
-                   float4 *output,
-                   float *output_luma,
-                   int type,
-                   int sx,
-                   int sw,
-                   int offset,
-                   int sample)
+kernel_cuda_displace(uint4 *input,
+                     float4 *output,
+                     int type,  /* not referenced in this body */
+                     int sx,
+                     int sw,
+                     int offset,  /* not referenced in this body */
+                     int sample)  /* not referenced in this body */
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
 	if(x < sx + sw) {
-		kernel_shader_evaluate(NULL,
-		                       input,
-		                       output,
-		                       output_luma,
-		                       (ShaderEvalType)type, 
-		                       x,
-		                       sample);
+		KernelGlobals kg;  /* local, not explicitly initialized here; passed by address below */
+		kernel_displace_evaluate(&kg, input, output, x);  /* one call per in-range x */
+	}
+}
+
+extern "C" __global__ void  /* CUDA entry point wrapping kernel_background_evaluate() */
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_background(uint4 *input,
+                       float4 *output,
+                       int type,  /* not referenced in this body */
+                       int sx,
+                       int sw,
+                       int offset,  /* not referenced in this body */
+                       int sample)  /* not referenced in this body */
+{
+	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;  /* this thread's index, starting at sx */
+
+	if(x < sx + sw) {  /* threads at or past sx + sw do nothing */
+		KernelGlobals kg;  /* local, not explicitly initialized here; passed by address below */
+		kernel_background_evaluate(&kg, input, output, x);
 	}
 }
 
@@ -200,8 +133,10 @@ kernel_cuda_bake(uint4 *input, float4 *output, int type, int filter, int sx, int
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
-	if(x < sx + sw)
-		kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, filter, x, offset, sample);
+	if(x < sx + sw) {
+		KernelGlobals kg;
+		kernel_bake_evaluate(&kg, input, output, (ShaderEvalType)type, filter, x, offset, sample);
+	}
 }
 #endif
 
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h
new file mode 100644
index 00000000000..7ae205b7e14
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* device data taken from CUDA occupancy calculator */
+
+/* 2.0 and 2.1 */
+#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 32
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
+
+/* 3.0 and 3.5 */
+#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 63
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 63
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 3.7 */
+#elif __CUDA_ARCH__ == 370
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 63
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 5.0 and above (e.g. 5.0, 5.2, 5.3, 6.0, 6.1) */
+#elif __CUDA_ARCH__ >= 500
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 48
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* unknown architecture */
+#else
+#  error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
+#endif
+
+/* For split kernel using all registers seems fastest for now, but this
+ * is unlikely to be optimal once we resolve other bottlenecks. */
+
+#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS
+
+/* Launch bounds for a kernel: threads per block (block width squared) and the minimum
+ * blocks per multiprocessor that fit the given per-thread register budget. */
+
+#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
+	__launch_bounds__( \
+		(threads_block_width)*(threads_block_width), \
+		CUDA_MULTIPRESSOR_MAX_REGISTERS/((threads_block_width)*(threads_block_width)*(thread_num_registers)) \
+		)
+
+/* sanity checks */
+
+#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
+#  error "Maximum number of threads per block exceeded"
+#endif
+
+#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
+#  error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#  error "Maximum number of registers per thread exceeded"
+#endif
+
+#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#  error "Maximum number of registers per thread exceeded"
+#endif
+
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
new file mode 100644
index 00000000000..b7be4fe4409
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
@@ -0,0 +1,310 @@
+/*
+ * Copyright 2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if __CUDA_ARCH__ >= 300
+
+/* Kepler */
+
+/* w0, w1, w2, and w3 are the four cubic B-spline basis functions. */
+ccl_device float cubic_w0(float a)
+{
+	return (1.0f/6.0f)*(a*(a*(-a + 3.0f) - 3.0f) + 1.0f);  /* Horner form of (-a^3 + 3a^2 - 3a + 1)/6 */
+}
+
+ccl_device float cubic_w1(float a)
+{
+	return (1.0f/6.0f)*(a*a*(3.0f*a - 6.0f) + 4.0f);  /* (3a^3 - 6a^2 + 4)/6 */
+}
+
+ccl_device float cubic_w2(float a)
+{
+	return (1.0f/6.0f)*(a*(a*(-3.0f*a + 3.0f) + 3.0f) + 1.0f);  /* Horner form of (-3a^3 + 3a^2 + 3a + 1)/6 */
+}
+
+ccl_device float cubic_w3(float a)
+{
+	return (1.0f/6.0f)*(a*a*a);  /* a^3/6 */
+}
+
+/* g0 and g1 are the two amplitude functions. */
+ccl_device float cubic_g0(float a)
+{
+	return cubic_w0(a) + cubic_w1(a);  /* combined weight of the first two basis functions */
+}
+
+ccl_device float cubic_g1(float a)
+{
+	return cubic_w2(a) + cubic_w3(a);  /* combined weight of the last two basis functions */
+}
+
+/* h0 and h1 are the two offset functions */
+ccl_device float cubic_h0(float a)
+{
+	/* Note +0.5 offset to compensate for CUDA linear filtering convention. */
+	return -1.0f + cubic_w1(a) / (cubic_w0(a) + cubic_w1(a)) + 0.5f;  /* algebraically w1/(w0 + w1) - 0.5 */
+}
+
+ccl_device float cubic_h1(float a)
+{
+	return 1.0f + cubic_w3(a) / (cubic_w2(a) + cubic_w3(a)) + 0.5f;  /* algebraically w3/(w2 + w3) + 1.5; same +0.5 term as cubic_h0 */
+}
+
+/* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
+template<typename T>
+ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo& info, CUtexObject tex, float x, float y)
+{
+	x = (x * info.width) - 0.5f;  /* scale by the texture size and shift by half a texel */
+	y = (y * info.height) - 0.5f;
+	/* Split each coordinate into its floor (px, py) and fractional part (fx, fy). */
+	float px = floor(x);
+	float py = floor(y);
+	float fx = x - px;
+	float fy = y - py;
+	/* Per-axis weights (g0, g1) and the two fetch positions per axis, divided back by the texture size. */
+	float g0x = cubic_g0(fx);
+	float g1x = cubic_g1(fx);
+	float x0 = (px + cubic_h0(fx)) / info.width;
+	float x1 = (px + cubic_h1(fx)) / info.width;
+	float y0 = (py + cubic_h0(fy)) / info.height;
+	float y1 = (py + cubic_h1(fy)) / info.height;
+	/* Weighted sum of the four tex2D fetches at (x0|x1, y0|y1). */
+	return cubic_g0(fy) * (g0x * tex2D<T>(tex, x0, y0) +
+	                       g1x * tex2D<T>(tex, x1, y0)) +
+	       cubic_g1(fy) * (g0x * tex2D<T>(tex, x0, y1) +
+	                       g1x * tex2D<T>(tex, x1, y1));
+}
+
+/* Fast tricubic texture lookup using 8 trilinear (tex3D) lookups. */
+template<typename T>
+ccl_device T kernel_tex_image_interp_bicubic_3d(const TextureInfo& info, CUtexObject tex, float x, float y, float z)
+{
+	x = (x * info.width) - 0.5f;  /* scale by the texture size and shift by half a texel */
+	y = (y * info.height) - 0.5f;
+	z = (z * info.depth) - 0.5f;
+	/* Split each coordinate into its floor (px, py, pz) and fractional part (fx, fy, fz). */
+	float px = floor(x);
+	float py = floor(y);
+	float pz = floor(z);
+	float fx = x - px;
+	float fy = y - py;
+	float fz = z - pz;
+	/* Two weights per axis. */
+	float g0x = cubic_g0(fx);
+	float g1x = cubic_g1(fx);
+	float g0y = cubic_g0(fy);
+	float g1y = cubic_g1(fy);
+	float g0z = cubic_g0(fz);
+	float g1z = cubic_g1(fz);
+	/* Two fetch positions per axis, divided back by the texture size. */
+	float x0 = (px + cubic_h0(fx)) / info.width;
+	float x1 = (px + cubic_h1(fx)) / info.width;
+	float y0 = (py + cubic_h0(fy)) / info.height;
+	float y1 = (py + cubic_h1(fy)) / info.height;
+	float z0 = (pz + cubic_h0(fz)) / info.depth;
+	float z1 = (pz + cubic_h1(fz)) / info.depth;
+	/* Weighted sum of the eight tex3D fetches at (x0|x1, y0|y1, z0|z1). */
+	return g0z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z0) +
+	                     g1x * tex3D<T>(tex, x1, y0, z0)) +
+	              g1y * (g0x * tex3D<T>(tex, x0, y1, z0) +
+	                     g1x * tex3D<T>(tex, x1, y1, z0))) +
+	       g1z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z1) +
+	                     g1x * tex3D<T>(tex, x1, y0, z1)) +
+	              g1y * (g0x * tex3D<T>(tex, x0, y1, z1) +
+	                     g1x * tex3D<T>(tex, x1, y1, z1)));
+}
+
+/* Sample 2D image texture `id` at (x, y) through its CUDA texture object.
+ *
+ * The interpolation mode is read from the texture's TextureInfo: cubic goes
+ * through kernel_tex_image_interp_bicubic(), anything else is a single tex2D()
+ * fetch.
+ *
+ * Four channel types (float4, byte4, half4) return the fetched value as is;
+ * all other types are fetched as one float f and returned as (f, f, f, 1).
+ */
+ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
+{
+	const TextureInfo& info = kernel_tex_fetch(__texture_info, id);
+	CUtexObject tex = (CUtexObject)info.data;
+	const bool use_cubic = (info.interpolation == INTERPOLATION_CUBIC);
+
+	/* Classify the texture by the value of kernel_tex_type(id). */
+	const int texture_type = kernel_tex_type(id);
+	const bool four_channels = (texture_type == IMAGE_DATA_TYPE_FLOAT4 ||
+	                            texture_type == IMAGE_DATA_TYPE_BYTE4 ||
+	                            texture_type == IMAGE_DATA_TYPE_HALF4);
+
+	/* Four channels: return the lookup result directly. */
+	if(four_channels) {
+		return use_cubic? kernel_tex_image_interp_bicubic<float4>(info, tex, x, y)
+		                : tex2D<float4>(tex, x, y);
+	}
+
+	/* Single channel: replicate the scalar into RGB, alpha is 1. */
+	const float f = use_cubic? kernel_tex_image_interp_bicubic<float>(info, tex, x, y)
+	                         : tex2D<float>(tex, x, y);
+	return make_float4(f, f, f, 1.0f);
+}
+
+/* Sample 3D image texture `id` at (x, y, z) through its CUDA texture object.
+ *
+ * interp overrides the interpolation stored in the texture's TextureInfo
+ * unless it is INTERPOLATION_NONE. Cubic goes through
+ * kernel_tex_image_interp_bicubic_3d(), anything else is a single tex3D() fetch.
+ *
+ * Four channel types (float4, byte4, half4) return the fetched value as is;
+ * all other types are fetched as one float f and returned as (f, f, f, 1).
+ */
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, InterpolationType interp)
+{
+	const TextureInfo& info = kernel_tex_fetch(__texture_info, id);
+	CUtexObject tex = (CUtexObject)info.data;
+	uint interpolation = (interp == INTERPOLATION_NONE)? info.interpolation: interp;
+	const bool use_cubic = (interpolation == INTERPOLATION_CUBIC);
+
+	const int texture_type = kernel_tex_type(id);
+	const bool four_channels = (texture_type == IMAGE_DATA_TYPE_FLOAT4 ||
+	                            texture_type == IMAGE_DATA_TYPE_BYTE4 ||
+	                            texture_type == IMAGE_DATA_TYPE_HALF4);
+
+	if(four_channels) {
+		return use_cubic? kernel_tex_image_interp_bicubic_3d<float4>(info, tex, x, y, z)
+		                : tex3D<float4>(tex, x, y, z);
+	}
+
+	/* Single channel: replicate the scalar into RGB, alpha is 1. */
+	const float f = use_cubic? kernel_tex_image_interp_bicubic_3d<float>(info, tex, x, y, z)
+	                         : tex3D<float>(tex, x, y, z);
+	return make_float4(f, f, f, 1.0f);
+}
+
+#else
+
+/* Fermi */
+
+ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
+{
+	float4 r;  /* assigned on every path of the switch below, including default */
+	switch(id) {  /* each handled id fetches from its own named __tex_image_* texture */
+		case 0: r = tex2D(__tex_image_float4_000, x, y); break;
+		case 8: r = tex2D(__tex_image_float4_008, x, y); break;
+		case 16: r = tex2D(__tex_image_float4_016, x, y); break;
+		case 24: r = tex2D(__tex_image_float4_024, x, y); break;
+		case 32: r = tex2D(__tex_image_float4_032, x, y); break;
+		case 1: r = tex2D(__tex_image_byte4_001, x, y); break;
+		case 9: r = tex2D(__tex_image_byte4_009, x, y); break;
+		case 17: r = tex2D(__tex_image_byte4_017, x, y); break;
+		case 25: r = tex2D(__tex_image_byte4_025, x, y); break;
+		case 33: r = tex2D(__tex_image_byte4_033, x, y); break;
+		case 41: r = tex2D(__tex_image_byte4_041, x, y); break;
+		case 49: r = tex2D(__tex_image_byte4_049, x, y); break;
+		case 57: r = tex2D(__tex_image_byte4_057, x, y); break;
+		case 65: r = tex2D(__tex_image_byte4_065, x, y); break;
+		case 73: r = tex2D(__tex_image_byte4_073, x, y); break;
+		case 81: r = tex2D(__tex_image_byte4_081, x, y); break;
+		case 89: r = tex2D(__tex_image_byte4_089, x, y); break;
+		case 97: r = tex2D(__tex_image_byte4_097, x, y); break;
+		case 105: r = tex2D(__tex_image_byte4_105, x, y); break;
+		case 113: r = tex2D(__tex_image_byte4_113, x, y); break;
+		case 121: r = tex2D(__tex_image_byte4_121, x, y); break;
+		case 129: r = tex2D(__tex_image_byte4_129, x, y); break;
+		case 137: r = tex2D(__tex_image_byte4_137, x, y); break;
+		case 145: r = tex2D(__tex_image_byte4_145, x, y); break;
+		case 153: r = tex2D(__tex_image_byte4_153, x, y); break;
+		case 161: r = tex2D(__tex_image_byte4_161, x, y); break;
+		case 169: r = tex2D(__tex_image_byte4_169, x, y); break;
+		case 177: r = tex2D(__tex_image_byte4_177, x, y); break;
+		case 185: r = tex2D(__tex_image_byte4_185, x, y); break;
+		case 193: r = tex2D(__tex_image_byte4_193, x, y); break;
+		case 201: r = tex2D(__tex_image_byte4_201, x, y); break;
+		case 209: r = tex2D(__tex_image_byte4_209, x, y); break;
+		case 217: r = tex2D(__tex_image_byte4_217, x, y); break;
+		case 225: r = tex2D(__tex_image_byte4_225, x, y); break;
+		case 233: r = tex2D(__tex_image_byte4_233, x, y); break;
+		case 241: r = tex2D(__tex_image_byte4_241, x, y); break;
+		case 249: r = tex2D(__tex_image_byte4_249, x, y); break;
+		case 257: r = tex2D(__tex_image_byte4_257, x, y); break;
+		case 265: r = tex2D(__tex_image_byte4_265, x, y); break;
+		case 273: r = tex2D(__tex_image_byte4_273, x, y); break;
+		case 281: r = tex2D(__tex_image_byte4_281, x, y); break;
+		case 289: r = tex2D(__tex_image_byte4_289, x, y); break;
+		case 297: r = tex2D(__tex_image_byte4_297, x, y); break;
+		case 305: r = tex2D(__tex_image_byte4_305, x, y); break;
+		case 313: r = tex2D(__tex_image_byte4_313, x, y); break;
+		case 321: r = tex2D(__tex_image_byte4_321, x, y); break;
+		case 329: r = tex2D(__tex_image_byte4_329, x, y); break;
+		case 337: r = tex2D(__tex_image_byte4_337, x, y); break;
+		case 345: r = tex2D(__tex_image_byte4_345, x, y); break;
+		case 353: r = tex2D(__tex_image_byte4_353, x, y); break;
+		case 361: r = tex2D(__tex_image_byte4_361, x, y); break;
+		case 369: r = tex2D(__tex_image_byte4_369, x, y); break;
+		case 377: r = tex2D(__tex_image_byte4_377, x, y); break;
+		case 385: r = tex2D(__tex_image_byte4_385, x, y); break;
+		case 393: r = tex2D(__tex_image_byte4_393, x, y); break;
+		case 401: r = tex2D(__tex_image_byte4_401, x, y); break;
+		case 409: r = tex2D(__tex_image_byte4_409, x, y); break;
+		case 417: r = tex2D(__tex_image_byte4_417, x, y); break;
+		case 425: r = tex2D(__tex_image_byte4_425, x, y); break;
+		case 433: r = tex2D(__tex_image_byte4_433, x, y); break;
+		case 441: r = tex2D(__tex_image_byte4_441, x, y); break;
+		case 449: r = tex2D(__tex_image_byte4_449, x, y); break;
+		case 457: r = tex2D(__tex_image_byte4_457, x, y); break;
+		case 465: r = tex2D(__tex_image_byte4_465, x, y); break;
+		case 473: r = tex2D(__tex_image_byte4_473, x, y); break;
+		case 481: r = tex2D(__tex_image_byte4_481, x, y); break;
+		case 489: r = tex2D(__tex_image_byte4_489, x, y); break;
+		case 497: r = tex2D(__tex_image_byte4_497, x, y); break;
+		case 505: r = tex2D(__tex_image_byte4_505, x, y); break;
+		case 513: r = tex2D(__tex_image_byte4_513, x, y); break;
+		case 521: r = tex2D(__tex_image_byte4_521, x, y); break;
+		case 529: r = tex2D(__tex_image_byte4_529, x, y); break;
+		case 537: r = tex2D(__tex_image_byte4_537, x, y); break;
+		case 545: r = tex2D(__tex_image_byte4_545, x, y); break;
+		case 553: r = tex2D(__tex_image_byte4_553, x, y); break;
+		case 561: r = tex2D(__tex_image_byte4_561, x, y); break;
+		case 569: r = tex2D(__tex_image_byte4_569, x, y); break;
+		case 577: r = tex2D(__tex_image_byte4_577, x, y); break;
+		case 585: r = tex2D(__tex_image_byte4_585, x, y); break;
+		case 593: r = tex2D(__tex_image_byte4_593, x, y); break;
+		case 601: r = tex2D(__tex_image_byte4_601, x, y); break;
+		case 609: r = tex2D(__tex_image_byte4_609, x, y); break;
+		case 617: r = tex2D(__tex_image_byte4_617, x, y); break;
+		case 625: r = tex2D(__tex_image_byte4_625, x, y); break;
+		case 633: r = tex2D(__tex_image_byte4_633, x, y); break;
+		case 641: r = tex2D(__tex_image_byte4_641, x, y); break;
+		case 649: r = tex2D(__tex_image_byte4_649, x, y); break;
+		case 657: r = tex2D(__tex_image_byte4_657, x, y); break;
+		case 665: r = tex2D(__tex_image_byte4_665, x, y); break;
+		default: r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);  /* ids without a case yield an all-zero float4 */
+	}
+	return r;
+}
+
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, InterpolationType interp)
+{
+	float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);  /* defined result for ids without a case below, same value as the 2D lookup's default */
+	switch(id) {  /* only these five ids have a 3D texture case; interp is not referenced in this body */
+		case 0: r = tex3D(__tex_image_float4_3d_000, x, y, z); break;
+		case 8: r = tex3D(__tex_image_float4_3d_008, x, y, z); break;
+		case 16: r = tex3D(__tex_image_float4_3d_016, x, y, z); break;
+		case 24: r = tex3D(__tex_image_float4_3d_024, x, y, z); break;
+		case 32: r = tex3D(__tex_image_float4_3d_032, x, y, z); break;
+	}
+	return r;
+}
+
+#endif
+
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
new file mode 100644
index 00000000000..43b3d0aa0e6
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA split kernel entry points */
+
+#ifdef __CUDA_ARCH__
+
+#define __SPLIT_KERNEL__
+
+#include "kernel/kernel_compat_cuda.h"
+#include "kernel_config.h"
+
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_data_init.h"
+#include "kernel/split/kernel_path_init.h"
+#include "kernel/split/kernel_scene_intersect.h"
+#include "kernel/split/kernel_lamp_emission.h"
+#include "kernel/split/kernel_do_volume.h"
+#include "kernel/split/kernel_queue_enqueue.h"
+#include "kernel/split/kernel_indirect_background.h"
+#include "kernel/split/kernel_shader_setup.h"
+#include "kernel/split/kernel_shader_sort.h"
+#include "kernel/split/kernel_shader_eval.h"
+#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#include "kernel/split/kernel_subsurface_scatter.h"
+#include "kernel/split/kernel_direct_lighting.h"
+#include "kernel/split/kernel_shadow_blocked_ao.h"
+#include "kernel/split/kernel_shadow_blocked_dl.h"
+#include "kernel/split/kernel_enqueue_inactive.h"
+#include "kernel/split/kernel_next_iteration_setup.h"
+#include "kernel/split/kernel_indirect_subsurface.h"
+#include "kernel/split/kernel_buffer_update.h"
+
+#include "kernel/kernel_film.h"
+
+/* kernels */
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_state_buffer_size(uint num_threads, uint64_t *size)  /* size: receives the result */
+{
+	*size = split_data_buffer_size(NULL, num_threads);  /* no thread index guard: every launched thread stores the same value */
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_path_trace_data_init(  /* thin entry point: forwards all parameters to kernel_data_init() */
+        ccl_global void *split_data_buffer,
+        int num_elements,
+        ccl_global char *ray_state,
+        int start_sample,
+        int end_sample,
+        int sx, int sy, int sw, int sh, int offset, int stride,
+        ccl_global int *Queue_index,
+        int queuesize,
+        ccl_global char *use_queues_flag,
+        ccl_global unsigned int *work_pool_wgs,
+        unsigned int num_samples,
+        ccl_global float *buffer)
+{
+	kernel_data_init(NULL,  /* the two leading arguments are passed as NULL */
+	                 NULL,
+	                 split_data_buffer,  /* from here on: the kernel parameters, unchanged and in declaration order */
+	                 num_elements,
+	                 ray_state,
+	                 start_sample,
+	                 end_sample,
+	                 sx, sy, sw, sh, offset, stride,
+	                 Queue_index,
+	                 queuesize,
+	                 use_queues_flag,
+	                 work_pool_wgs,
+	                 num_samples,
+	                 buffer);
+}
+
+#define DEFINE_SPLIT_KERNEL_FUNCTION(name) /* defines kernel_cuda_<name>(), which calls kernel_<name>(NULL) */ \
+	extern "C" __global__ void \
+	CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
+	kernel_cuda_##name() \
+	{ \
+		kernel_##name(NULL); \
+	}
+
+#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) /* defines kernel_cuda_<name>(), which calls kernel_<name>(NULL, &locals) */ \
+	extern "C" __global__ void \
+	CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
+	kernel_cuda_##name() \
+	{ \
+		ccl_local type locals; /* declared with the ccl_local qualifier and passed by address */ \
+		kernel_##name(NULL, &locals); \
+	}
+
+DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
+DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
+DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
+DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	const int px = sx + blockDim.x*blockIdx.x + threadIdx.x;  /* x index handled by this thread */
+	const int py = sy + blockDim.y*blockIdx.y + threadIdx.y;  /* y index handled by this thread */
+	if(px < sx + sw && py < sy + sh) {  /* threads outside [sx, sx + sw) x [sy, sy + sh) do nothing */
+		kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, px, py, offset, stride);
+	}
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	const int px = sx + blockDim.x*blockIdx.x + threadIdx.x;  /* x index handled by this thread */
+	const int py = sy + blockDim.y*blockIdx.y + threadIdx.y;  /* y index handled by this thread */
+	if(px < sx + sw && py < sy + sh) {  /* threads outside [sx, sx + sw) x [sy, sy + sh) do nothing */
+		kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, px, py, offset, stride);
+	}
+}
+
+#endif
+
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
new file mode 100644
index 00000000000..7a7b596a350
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -0,0 +1,276 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* OpenCL kernel entry points */
+
+#include "kernel/kernel_compat_opencl.h"
+
+#include "kernel/filter/filter_kernel.h"
+
+/* kernels */
+
+__kernel void kernel_ocl_filter_divide_shadow(int sample,
+                                              ccl_global TilesInfo *tiles,
+                                              ccl_global float *unfilteredA,
+                                              ccl_global float *unfilteredB,
+                                              ccl_global float *sampleVariance,
+                                              ccl_global float *sampleVarianceV,
+                                              ccl_global float *bufferVariance,
+                                              int4 prefilter_rect,
+                                              int buffer_pass_stride,
+                                              int buffer_denoising_offset)
+{
+	/* One pixel per work item; prefilter_rect.z/.w are used as exclusive upper bounds. */
+	const int px = prefilter_rect.x + get_global_id(0);
+	const int py = prefilter_rect.y + get_global_id(1);
+	if(px >= prefilter_rect.z || py >= prefilter_rect.w) {
+		return;
+	}
+
+	kernel_filter_divide_shadow(sample, tiles,
+	                            px, py,
+	                            unfilteredA, unfilteredB,
+	                            sampleVariance, sampleVarianceV,
+	                            bufferVariance,
+	                            prefilter_rect,
+	                            buffer_pass_stride,
+	                            buffer_denoising_offset);
+}
+
+__kernel void kernel_ocl_filter_get_feature(int sample,
+                                            ccl_global TilesInfo *tiles,
+                                            int m_offset,
+                                            int v_offset,
+                                            ccl_global float *mean,
+                                            ccl_global float *variance,
+                                            int4 prefilter_rect,
+                                            int buffer_pass_stride,
+                                            int buffer_denoising_offset)
+{
+	const int px = prefilter_rect.x + get_global_id(0);  /* one pixel per work item */
+	const int py = prefilter_rect.y + get_global_id(1);
+	if(px >= prefilter_rect.z || py >= prefilter_rect.w) {  /* .z/.w are used as exclusive upper bounds */
+		return;
+	}
+	kernel_filter_get_feature(sample, tiles,
+	                          m_offset, v_offset,
+	                          px, py,
+	                          mean, variance,
+	                          prefilter_rect,
+	                          buffer_pass_stride,
+	                          buffer_denoising_offset);
+}
+
+__kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image,
+                                                ccl_global float *variance,
+                                                ccl_global float *depth,
+                                                ccl_global float *output,
+                                                int4 prefilter_rect,
+                                                int pass_stride)
+{
+	int x = prefilter_rect.x + get_global_id(0);  /* global id offset by the rect origin */
+	int y = prefilter_rect.y + get_global_id(1);
+	if(x < prefilter_rect.z && y < prefilter_rect.w) {  /* .z/.w are used as exclusive upper bounds; other work items do nothing */
+		kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
+	}
+}
+
+__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
+                                               ccl_global float *variance,
+                                               ccl_global float *a,
+                                               ccl_global float *b,
+                                               int4 prefilter_rect,
+                                               int r)
+{
+	int x = prefilter_rect.x + get_global_id(0);  /* global id offset by the rect origin */
+	int y = prefilter_rect.y + get_global_id(1);
+	if(x < prefilter_rect.z && y < prefilter_rect.w) {  /* .z/.w are used as exclusive upper bounds; other work items do nothing */
+		kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
+	}
+}
+
+__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
+                                                    ccl_global float *transform,
+                                                    ccl_global int *rank,
+                                                    int4 filter_area,
+                                                    int4 rect,
+                                                    int pass_stride,
+                                                    int radius,
+                                                    float pca_threshold)
+{
+	const int gx = get_global_id(0);  /* position inside the filter area, not yet offset by its origin */
+	const int gy = get_global_id(1);
+	if(gx >= filter_area.z || gy >= filter_area.w) {  /* filter_area.z/.w are used as width/height here */
+		return;
+	}
+	const int storage_ofs = gy*filter_area.z + gx;  /* per-pixel slot in transform and rank */
+	kernel_filter_construct_transform(buffer,
+	                                  gx + filter_area.x, gy + filter_area.y,
+	                                  rect, pass_stride,
+	                                  transform + storage_ofs, rank + storage_ofs,
+	                                  radius, pca_threshold,
+	                                  filter_area.z*filter_area.w,
+	                                  get_local_id(1)*get_local_size(0) + get_local_id(0));
+}
+
+__kernel void kernel_ocl_filter_nlm_calc_difference(int dx,
+                                                    int dy,
+                                                    const ccl_global float *ccl_restrict weight_image,
+                                                    const ccl_global float *ccl_restrict variance_image,
+                                                    ccl_global float *difference_image,
+                                                    int4 rect,
+                                                    int w,
+                                                    int channel_offset,
+                                                    float a,
+                                                    float k_2)
+{
+	int x = get_global_id(0) + rect.x;  /* global id offset by the rect origin */
+	int y = get_global_id(1) + rect.y;
+	if(x < rect.z && y < rect.w) {  /* rect.z/.w are used as exclusive upper bounds; other work items do nothing */
+		kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2);
+	}
+}
+
+__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image,
+                                         ccl_global float *out_image,
+                                         int4 rect,
+                                         int w,
+                                         int f)
+{
+	int x = get_global_id(0) + rect.x;  /* global id offset by the rect origin */
+	int y = get_global_id(1) + rect.y;
+	if(x < rect.z && y < rect.w) {  /* rect.z/.w are used as exclusive upper bounds; other work items do nothing */
+		kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f);
+	}
+}
+
+__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image,
+                                                ccl_global float *out_image,
+                                                int4 rect,
+                                                int w,
+                                                int f)
+{
+	int x = get_global_id(0) + rect.x;  /* global id offset by the rect origin */
+	int y = get_global_id(1) + rect.y;
+	if(x < rect.z && y < rect.w) {  /* rect.z/.w are used as exclusive upper bounds; other work items do nothing */
+		kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f);
+	}
+}
+
+__kernel void kernel_ocl_filter_nlm_update_output(int dx,
+                                                  int dy,
+                                                  const ccl_global float *ccl_restrict difference_image,
+                                                  const ccl_global float *ccl_restrict image,
+                                                  ccl_global float *out_image,
+                                                  ccl_global float *accum_image,
+                                                  int4 rect,
+                                                  int w,
+                                                  int f)
+{
+	int x = get_global_id(0) + rect.x;  /* global id offset by the rect origin */
+	int y = get_global_id(1) + rect.y;
+	if(x < rect.z && y < rect.w) {  /* rect.z/.w are used as exclusive upper bounds; other work items do nothing */
+		kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f);
+	}
+}
+
+__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image,
+                                              const ccl_global float *ccl_restrict accum_image,
+                                              int4 rect,
+                                              int w)
+{
+	int x = get_global_id(0) + rect.x;  /* global id offset by the rect origin */
+	int y = get_global_id(1) + rect.y;
+	if(x < rect.z && y < rect.w) {  /* rect.z/.w are used as exclusive upper bounds; other work items do nothing */
+		kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w);
+	}
+}
+
+__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
+                                                      int dy,
+                                                      const ccl_global float *ccl_restrict difference_image,
+                                                      const ccl_global float *ccl_restrict buffer,
+                                                      const ccl_global float *ccl_restrict transform,
+                                                      ccl_global int *rank,
+                                                      ccl_global float *XtWX,
+                                                      ccl_global float3 *XtWY,
+                                                      int4 rect,
+                                                      int4 filter_rect,
+                                                      int w,
+                                                      int h,
+                                                      int f,
+                                                      int pass_stride)
+{
+	const int gx = get_global_id(0) + max(0, rect.x-filter_rect.x);  /* start offset is never negative */
+	const int gy = get_global_id(1) + max(0, rect.y-filter_rect.y);
+	if(gx >= min(filter_rect.z, rect.z-filter_rect.x) || gy >= min(filter_rect.w, rect.w-filter_rect.y)) {
+		return;
+	}
+	kernel_filter_nlm_construct_gramian(gx, gy, dx, dy,
+	                                    difference_image,
+	                                    buffer,
+	                                    transform, rank,
+	                                    XtWX, XtWY,
+	                                    rect, filter_rect,
+	                                    w, h, f,
+	                                    pass_stride,
+	                                    get_local_id(1)*get_local_size(0) + get_local_id(0));
+}
+
+__kernel void kernel_ocl_filter_finalize(int w,
+                                         int h,
+                                         ccl_global float *buffer,
+                                         ccl_global int *rank,
+                                         ccl_global float *XtWX,
+                                         ccl_global float3 *XtWY,
+                                         int4 filter_area,
+                                         int4 buffer_params,
+                                         int sample)
+{
+	int x = get_global_id(0);
+	int y = get_global_id(1);
+	if(x < filter_area.z && y < filter_area.w) {  /* filter_area.z/.w bound the un-offset ids, i.e. are used as width/height */
+		int storage_ofs = y*filter_area.z+x;  /* row-major index of this work item within the filter area */
+		rank += storage_ofs;  /* advance the three per-pixel arrays to this work item's slot */
+		XtWX += storage_ofs;
+		XtWY += storage_ofs;
+		kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
+	}
+}
+
+__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles,
+                                          ccl_global float *buffer_1,
+                                          ccl_global float *buffer_2,
+                                          ccl_global float *buffer_3,
+                                          ccl_global float *buffer_4,
+                                          ccl_global float *buffer_5,
+                                          ccl_global float *buffer_6,
+                                          ccl_global float *buffer_7,
+                                          ccl_global float *buffer_8,
+                                          ccl_global float *buffer_9)
+{
+	if((get_global_id(0) == 0) && (get_global_id(1) == 0)) {  /* only work item (0, 0) writes; all others do nothing */
+		tiles->buffers[0] = buffer_1;  /* buffer_N is stored at index N - 1 */
+		tiles->buffers[1] = buffer_2;
+		tiles->buffers[2] = buffer_3;
+		tiles->buffers[3] = buffer_4;
+		tiles->buffers[4] = buffer_5;
+		tiles->buffers[5] = buffer_6;
+		tiles->buffers[6] = buffer_7;
+		tiles->buffers[7] = buffer_8;
+		tiles->buffers[8] = buffer_9;
+	}
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index a68f97857b6..9d5d784e140 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -16,45 +16,42 @@
 
 /* OpenCL kernel entry points - unfinished */
 
-#include "../../kernel_compat_opencl.h"
-#include "../../kernel_math.h"
-#include "../../kernel_types.h"
-#include "../../kernel_globals.h"
-#include "../../kernel_image_opencl.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/kernel_math.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernels/opencl/kernel_opencl_image.h"
 
-#include "../../kernel_film.h"
+#include "kernel/kernel_film.h"
 
 #if defined(__COMPILE_ONLY_MEGAKERNEL__) || !defined(__NO_BAKING__)
-#  include "../../kernel_path.h"
-#  include "../../kernel_path_branched.h"
+#  include "kernel/kernel_path.h"
+#  include "kernel/kernel_path_branched.h"
 #else  /* __COMPILE_ONLY_MEGAKERNEL__ */
 /* Include only actually used headers for the case
  * when path tracing kernels are not needed.
  */
-#  include "../../kernel_random.h"
-#  include "../../kernel_differential.h"
-#  include "../../kernel_montecarlo.h"
-#  include "../../kernel_projection.h"
-#  include "../../geom/geom.h"
-#  include "../../bvh/bvh.h"
-
-#  include "../../kernel_accumulate.h"
-#  include "../../kernel_camera.h"
-#  include "../../kernel_shader.h"
+#  include "kernel/kernel_random.h"
+#  include "kernel/kernel_differential.h"
+#  include "kernel/kernel_montecarlo.h"
+#  include "kernel/kernel_projection.h"
+#  include "kernel/geom/geom.h"
+#  include "kernel/bvh/bvh.h"
+
+#  include "kernel/kernel_accumulate.h"
+#  include "kernel/kernel_camera.h"
+#  include "kernel/kernel_shader.h"
 #endif  /* defined(__COMPILE_ONLY_MEGAKERNEL__) || !defined(__NO_BAKING__) */
 
-#include "../../kernel_bake.h"
+#include "kernel/kernel_bake.h"
 
 #ifdef __COMPILE_ONLY_MEGAKERNEL__
 
 __kernel void kernel_ocl_path_trace(
 	ccl_constant KernelData *data,
 	ccl_global float *buffer,
-	ccl_global uint *rng_state,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "../../kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	int sample,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -63,28 +60,24 @@ __kernel void kernel_ocl_path_trace(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "../../kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
-	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
+	int x = sx + ccl_global_id(0);
+	int y = sy + ccl_global_id(1);
 
 	if(x < sx + sw && y < sy + sh)
-		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
+		kernel_path_trace(kg, buffer, sample, x, y, offset, stride);
 }
 
 #else  /* __COMPILE_ONLY_MEGAKERNEL__ */
 
-__kernel void kernel_ocl_shader(
+__kernel void kernel_ocl_displace(
 	ccl_constant KernelData *data,
 	ccl_global uint4 *input,
 	ccl_global float4 *output,
-	ccl_global float *output_luma,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "../../kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	int type, int sx, int sw, int offset, int sample)
 {
@@ -92,20 +85,35 @@ __kernel void kernel_ocl_shader(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "../../kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
-	int x = sx + get_global_id(0);
+	int x = sx + ccl_global_id(0);
 
 	if(x < sx + sw) {
-		kernel_shader_evaluate(kg,
-		                       input,
-		                       output,
-		                       output_luma,
-		                       (ShaderEvalType)type,
-		                       x,
-		                       sample);
+		kernel_displace_evaluate(kg, input, output, x);
+	}
+}
+__kernel void kernel_ocl_background(  /* Entry point: calls kernel_background_evaluate() for index x = sx + ccl_global_id(0) when x < sx + sw */
+	ccl_constant KernelData *data,
+	ccl_global uint4 *input,
+	ccl_global float4 *output,
+
+	KERNEL_BUFFER_PARAMS,  /* Macro defined elsewhere; the matching KERNEL_BUFFER_ARGS is passed on below */
+
+	int type, int sx, int sw, int offset, int sample)  /* type, offset and sample are not referenced in the body */
+{
+	KernelGlobals kglobals, *kg = &kglobals;  /* kg points at the function-local kglobals */
+
+	kg->data = data;
+
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
+
+	int x = sx + ccl_global_id(0);  /* sx is added to the work-item id to get the index handed to the evaluator */
+
+	if(x < sx + sw) {  /* Work-items whose x is at or beyond sx + sw do nothing */
+		kernel_background_evaluate(kg, input, output, x);
 	}
 }
 
@@ -114,9 +122,7 @@ __kernel void kernel_ocl_bake(
 	ccl_global uint4 *input,
 	ccl_global float4 *output,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "../../kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	int type, int filter, int sx, int sw, int offset, int sample)
 {
@@ -124,11 +130,10 @@ __kernel void kernel_ocl_bake(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "../../kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
-	int x = sx + get_global_id(0);
+	int x = sx + ccl_global_id(0);
 
 	if(x < sx + sw) {
 #ifdef __NO_BAKING__
@@ -144,9 +149,7 @@ __kernel void kernel_ocl_convert_to_byte(
 	ccl_global uchar4 *rgba,
 	ccl_global float *buffer,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "../../kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	float sample_scale,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -155,12 +158,11 @@ __kernel void kernel_ocl_convert_to_byte(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "../../kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
-	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
+	int x = sx + ccl_global_id(0);
+	int y = sy + ccl_global_id(1);
 
 	if(x < sx + sw && y < sy + sh)
 		kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
@@ -171,9 +173,7 @@ __kernel void kernel_ocl_convert_to_half_float(
 	ccl_global uchar4 *rgba,
 	ccl_global float *buffer,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "../../kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	float sample_scale,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -182,15 +182,30 @@ __kernel void kernel_ocl_convert_to_half_float(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "../../kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
-	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
+	int x = sx + ccl_global_id(0);
+	int y = sy + ccl_global_id(1);
 
 	if(x < sx + sw && y < sy + sh)
 		kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
+__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, uint64_t size, uint64_t offset)  /* Writes zeros into buffer: size and offset are used as byte counts (both are divided by sizeof(float4)) */
+{
+	size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);  /* Single index built from the two ids; each work-item handles at most one float4 */
+
+	if(i < size / sizeof(float4)) {  /* Indices that cover a whole float4 */
+		buffer[i+offset/sizeof(float4)] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);  /* Integer division: offset is rounded down to a whole number of float4 elements */
+	}
+	else if(i == size / sizeof(float4)) {  /* The first index past the whole elements clears the leftover bytes */
+		ccl_global uchar *b = (ccl_global uchar*)&buffer[i+offset/sizeof(float4)];
+
+		for(i = 0; i < size % sizeof(float4); i++) {  /* size % sizeof(float4) trailing bytes, one per iteration; i is reused as the byte counter */
+			*(b++) = 0;
+		}
+	}
+}
+
 #endif  /* __COMPILE_ONLY_MEGAKERNEL__ */
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
deleted file mode 100644
index 1914d241eb1..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "split/kernel_background_buffer_update.h"
-
-__kernel void kernel_ocl_path_trace_background_buffer_update(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_state,
-        ccl_global uint *rng_coop,             /* Required for buffer Update */
-        ccl_global float3 *throughput_coop,    /* Required for background hit processing */
-        PathRadiance *PathRadiance_coop,       /* Required for background hit processing and buffer Update */
-        ccl_global Ray *Ray_coop,              /* Required for background hit processing */
-        ccl_global PathState *PathState_coop,  /* Required for background hit processing */
-        ccl_global float *L_transparent_coop,  /* Required for background hit processing and buffer Update */
-        ccl_global char *ray_state,            /* Stores information on the current state of a ray */
-        int sw, int sh, int sx, int sy, int stride,
-        int rng_state_offset_x,
-        int rng_state_offset_y,
-        int rng_state_stride,
-        ccl_global unsigned int *work_array,   /* Denotes work of each ray */
-        ccl_global int *Queue_data,            /* Queues memory */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
-        int queuesize,                         /* Size (capacity) of each queue */
-        int end_sample,
-        int start_sample,
-#ifdef __WORK_STEALING__
-        ccl_global unsigned int *work_pool_wgs,
-        unsigned int num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples)                  /* Number of samples to be processed in parallel */
-{
-	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	if(ray_index == 0) {
-		/* We will empty this queue in this kernel. */
-		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
-	}
-	char enqueue_flag = 0;
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          1);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not
-	 * required.
-	 *
-	 * If we are executing on a CPU device, then we need to keep all threads
-	 * active since we have barrier() calls later in the kernel. CPU devices,
-	 * expect all threads to execute barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-		enqueue_flag =
-			kernel_background_buffer_update((KernelGlobals *)kg,
-			                                per_sample_output_buffers,
-			                                rng_state,
-			                                rng_coop,
-			                                throughput_coop,
-			                                PathRadiance_coop,
-			                                Ray_coop,
-			                                PathState_coop,
-			                                L_transparent_coop,
-			                                ray_state,
-			                                sw, sh, sx, sy, stride,
-			                                rng_state_offset_x,
-			                                rng_state_offset_y,
-			                                rng_state_stride,
-			                                work_array,
-			                                end_sample,
-			                                start_sample,
-#ifdef __WORK_STEALING__
-			                                work_pool_wgs,
-			                                num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
-			                                debugdata_coop,
-#endif
-			                                parallel_samples,
-			                                ray_index);
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif
-
-	/* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
-	 * These rays will be made active during next SceneIntersectkernel.
-	 */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics,
-	                        Queue_data,
-	                        Queue_index);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
new file mode 100644
index 00000000000..dcea2630aef
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_buffer_update.h"
+
+#define KERNEL_NAME buffer_update  /* Set only for the include below; presumably selects the split kernel it wraps - confirm in kernel_split_function.h */
+#define LOCALS_TYPE unsigned int  /* Second input to the same include; how it is used is defined in that header, not visible here */
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME  /* Both macros are removed again right after the include */
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
index 18139687eab..7125348a49f 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
@@ -14,77 +14,40 @@
  * limitations under the License.
  */
 
-#include "split/kernel_data_init.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_data_init.h"
 
 __kernel void kernel_ocl_path_trace_data_init(
-        ccl_global char *globals,
-        ccl_global char *sd_DL_shadow,
+        ccl_global char *kg,                         /* Cast to KernelGlobals* when passed to kernel_data_init() below */
         ccl_constant KernelData *data,
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_state,
-        ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
-        ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
-        ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
-        PathRadiance *PathRadiance_coop,             /* PathRadiance array to store PathRadiance values for all rays */
-        ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
-        ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
-        Intersection *Intersection_coop_shadow,
-        ccl_global char *ray_state,                  /* Stores information on current state of a ray */
-
-#define KERNEL_TEX(type, ttype, name)                                   \
-        ccl_global type *name,
-#include "../../kernel_textures.h"
-
-        int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
-        int rng_state_offset_x,
-        int rng_state_offset_y,
-        int rng_state_stride,
-        ccl_global int *Queue_data,                  /* Memory for queues */
+        ccl_global void *split_data_buffer,          /* Untyped here; handed to kernel_data_init() unchanged */
+        int num_elements,
+        ccl_global char *ray_state,
+		KERNEL_BUFFER_PARAMS,  /* Macro defined elsewhere; the call below passes the matching KERNEL_BUFFER_ARGS */
+        int start_sample,
+        int end_sample,
+        int sx, int sy, int sw, int sh, int offset, int stride,
         ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
         int queuesize,                               /* size (capacity) of the queue */
         ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
-        ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
-#ifdef __WORK_STEALING__
         ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
         unsigned int num_samples,                    /* Total number of samples per pixel */
-#endif
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples)                        /* Number of samples to be processed in parallel */
+        ccl_global float *buffer)                    /* Last argument forwarded to kernel_data_init() */
 {
-	kernel_data_init((KernelGlobals *)globals,
-	                 (ShaderData *)sd_DL_shadow,
+	kernel_data_init((KernelGlobals*)kg,  /* Thin wrapper: this single call forwards every parameter in declaration order */
 	                 data,
-	                 per_sample_output_buffers,
-	                 rng_state,
-	                 rng_coop,
-	                 throughput_coop,
-	                 L_transparent_coop,
-	                 PathRadiance_coop,
-	                 Ray_coop,
-	                 PathState_coop,
-	                 Intersection_coop_shadow,
+	                 split_data_buffer,
+	                 num_elements,
 	                 ray_state,
-
-#define KERNEL_TEX(type, ttype, name) name,
-#include "../../kernel_textures.h"
-
-	                 start_sample, sx, sy, sw, sh, offset, stride,
-	                 rng_state_offset_x,
-	                 rng_state_offset_y,
-	                 rng_state_stride,
-	                 Queue_data,
+	                 KERNEL_BUFFER_ARGS,
+	                 start_sample,
+	                 end_sample,
+	                 sx, sy, sw, sh, offset, stride,
 	                 Queue_index,
 	                 queuesize,
 	                 use_queues_flag,
-	                 work_array,
-#ifdef __WORK_STEALING__
 	                 work_pool_wgs,
 	                 num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
-	                 debugdata_coop,
-#endif
-	                 parallel_samples);
+	                 buffer);
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
index c6a2c8d050c..ed64ae01aae 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
@@ -14,74 +14,13 @@
  * limitations under the License.
  */
 
-#include "split/kernel_direct_lighting.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_direct_lighting.h"
 
-__kernel void kernel_ocl_path_trace_direct_lighting(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global char *sd,                    /* Required for direct lighting */
-        ccl_global uint *rng_coop,              /* Required for direct lighting */
-        ccl_global PathState *PathState_coop,   /* Required for direct lighting */
-        ccl_global int *ISLamp_coop,            /* Required for direct lighting */
-        ccl_global Ray *LightRay_coop,          /* Required for direct lighting */
-        ccl_global BsdfEval *BSDFEval_coop,     /* Required for direct lighting */
-        ccl_global char *ray_state,             /* Denotes the state of each ray */
-        ccl_global int *Queue_data,             /* Queue memory */
-        ccl_global int *Queue_index,            /* Tracks the number of elements in each queue */
-        int queuesize)                          /* Size (capacity) of each queue */
-{
-	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
+#define KERNEL_NAME direct_lighting  /* Set only for the include below; presumably selects the split kernel it wraps - confirm in kernel_split_function.h */
+#define LOCALS_TYPE unsigned int  /* Second input to the same include; how it is used is defined in that header, not visible here */
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME  /* Both macros are removed again right after the include */
+#undef LOCALS_TYPE
 
-	char enqueue_flag = 0;
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not
-	 * required.
-	 *
-	 * If we are executing on a CPU device, then we need to keep all threads
-	 * active since we have barrier() calls later in the kernel. CPU devices,
-	 * expect all threads to execute barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-		enqueue_flag = kernel_direct_lighting((KernelGlobals *)kg,
-		                                      (ShaderData *)sd,
-		                                      rng_coop,
-		                                      PathState_coop,
-		                                      ISLamp_coop,
-		                                      LightRay_coop,
-		                                      BSDFEval_coop,
-		                                      ray_state,
-		                                      ray_index);
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif
-
-#ifdef __EMISSION__
-	/* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_SHADOW_RAY_CAST_DL_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics,
-	                        Queue_data,
-	                        Queue_index);
-#endif
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
new file mode 100644
index 00000000000..8afaa686e28
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_do_volume.h"
+
+#define KERNEL_NAME do_volume  /* Set only for the include below; presumably selects the split kernel it wraps - confirm in kernel_split_function.h */
+#include "kernel/kernels/opencl/kernel_split_function.h"  /* This file sets no LOCALS_TYPE before the include */
+#undef KERNEL_NAME  /* Removed again right after the include */
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
new file mode 100644
index 00000000000..e68d4104a91
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_enqueue_inactive.h"
+
+#define KERNEL_NAME enqueue_inactive  /* Set only for the include below; presumably selects the split kernel it wraps - confirm in kernel_split_function.h */
+#define LOCALS_TYPE unsigned int  /* Second input to the same include; how it is used is defined in that header, not visible here */
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME  /* Both macros are removed again right after the include */
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
index e063614da1a..9e1e57beba6 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
@@ -14,110 +14,13 @@
  * limitations under the License.
  */
 
-#include "split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
 
-__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global char *sd,                   /* Required throughout the kernel except probabilistic path termination and AO */
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_coop,             /* Required for "kernel_write_data_passes" and AO */
-        ccl_global float3 *throughput_coop,    /* Required for handling holdout material and AO */
-        ccl_global float *L_transparent_coop,  /* Required for handling holdout material */
-        PathRadiance *PathRadiance_coop,       /* Required for "kernel_write_data_passes" and indirect primitive emission */
-        ccl_global PathState *PathState_coop,  /* Required throughout the kernel and AO */
-        Intersection *Intersection_coop,       /* Required for indirect primitive emission */
-        ccl_global float3 *AOAlpha_coop,       /* Required for AO */
-        ccl_global float3 *AOBSDF_coop,        /* Required for AO */
-        ccl_global Ray *AOLightRay_coop,       /* Required for AO */
-        int sw, int sh, int sx, int sy, int stride,
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        ccl_global unsigned int *work_array,   /* Denotes the work that each ray belongs to */
-        ccl_global int *Queue_data,            /* Queue memory */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
-        int queuesize,                         /* Size (capacity) of each queue */
-#ifdef __WORK_STEALING__
-        unsigned int start_sample,
-#endif
-        int parallel_samples)                  /* Number of samples to be processed in parallel */
-{
-	ccl_local unsigned int local_queue_atomics_bg;
-	ccl_local unsigned int local_queue_atomics_ao;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics_bg = 0;
-		local_queue_atomics_ao = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
+#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao  /* Set only for the include below; presumably selects the split kernel it wraps - confirm in kernel_split_function.h */
+#define LOCALS_TYPE BackgroundAOLocals  /* Second input to the same include; BackgroundAOLocals is declared elsewhere, not visible here */
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME  /* Both macros are removed again right after the include */
+#undef LOCALS_TYPE
 
-	char enqueue_flag = 0;
-	char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not
-	 * required.
-	 *
-	 * If we are executing on a CPU device, then we need to keep all threads
-	 * active since we have barrier() calls later in the kernel. CPU devices,
-	 * expect all threads to execute barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-#endif  /* __COMPUTE_DEVICE_GPU__ */
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-		kernel_holdout_emission_blurring_pathtermination_ao(
-		        (KernelGlobals *)kg,
-		        (ShaderData *)sd,
-		        per_sample_output_buffers,
-		        rng_coop,
-		        throughput_coop,
-		        L_transparent_coop,
-		        PathRadiance_coop,
-		        PathState_coop,
-		        Intersection_coop,
-		        AOAlpha_coop,
-		        AOBSDF_coop,
-		        AOLightRay_coop,
-		        sw, sh, sx, sy, stride,
-		        ray_state,
-		        work_array,
-#ifdef __WORK_STEALING__
-		        start_sample,
-#endif
-		        parallel_samples,
-		        ray_index,
-		        &enqueue_flag,
-		        &enqueue_flag_AO_SHADOW_RAY_CAST);
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif
-
-	/* Enqueue RAY_UPDATE_BUFFER rays. */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics_bg,
-	                        Queue_data,
-	                        Queue_index);
-
-#ifdef __AO__
-	/* Enqueue to-shadow-ray-cast rays. */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_SHADOW_RAY_CAST_AO_RAYS,
-	                        enqueue_flag_AO_SHADOW_RAY_CAST,
-	                        queuesize,
-	                        &local_queue_atomics_ao,
-	                        Queue_data,
-	                        Queue_index);
-#endif
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
new file mode 100644
index 00000000000..192d01444ba
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_indirect_background.h"
+
+#define KERNEL_NAME indirect_background  /* Set only for the include below; presumably selects the split kernel it wraps - confirm in kernel_split_function.h */
+#include "kernel/kernels/opencl/kernel_split_function.h"  /* This file sets no LOCALS_TYPE before the include */
+#undef KERNEL_NAME  /* Removed again right after the include */
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
new file mode 100644
index 00000000000..84938b889e5
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_indirect_subsurface.h"
+
+#define KERNEL_NAME indirect_subsurface  /* Set only for the include below; presumably selects the split kernel it wraps - confirm in kernel_split_function.h */
+#include "kernel/kernels/opencl/kernel_split_function.h"  /* This file sets no LOCALS_TYPE before the include */
+#undef KERNEL_NAME  /* Removed again right after the include */
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
index 267bddc2ffc..c314dc96c33 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
@@ -14,67 +14,11 @@
  * limitations under the License.
  */
 
-#include "split/kernel_lamp_emission.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_lamp_emission.h"
 
-__kernel void kernel_ocl_path_trace_lamp_emission(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global float3 *throughput_coop,    /* Required for lamp emission */
-        PathRadiance *PathRadiance_coop,       /* Required for lamp emission */
-        ccl_global Ray *Ray_coop,              /* Required for lamp emission */
-        ccl_global PathState *PathState_coop,  /* Required for lamp emission */
-        Intersection *Intersection_coop,       /* Required for lamp emission */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int sw, int sh,
-        ccl_global int *Queue_data,            /* Memory for queues */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in queues */
-        int queuesize,                         /* Size (capacity) of queues */
-        ccl_global char *use_queues_flag,      /* Used to decide if this kernel should use
-                                                * queues to fetch ray index
-                                                */
-        int parallel_samples)                  /* Number of samples to be processed in parallel */
-{
-	int x = get_global_id(0);
-	int y = get_global_id(1);
+#define KERNEL_NAME lamp_emission
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
 
-	/* We will empty this queue in this kernel. */
-	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
-		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
-	}
-	/* Fetch use_queues_flag. */
-	ccl_local char local_use_queues_flag;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_use_queues_flag = use_queues_flag[0];
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int ray_index;
-	if(local_use_queues_flag) {
-		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-		ray_index = get_ray_index(thread_index,
-		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-		                          Queue_data,
-		                          queuesize,
-		                          1);
-		if(ray_index == QUEUE_EMPTY_SLOT) {
-			return;
-		}
-	} else {
-		if(x < (sw * parallel_samples) && y < sh) {
-			ray_index = x + y * (sw * parallel_samples);
-		} else {
-			return;
-		}
-	}
-
-	kernel_lamp_emission((KernelGlobals *)kg,
-	                     throughput_coop,
-	                     PathRadiance_coop,
-	                     Ray_coop,
-	                     PathState_coop,
-	                     Intersection_coop,
-	                     ray_state,
-	                     sw, sh,
-	                     use_queues_flag,
-	                     ray_index);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
index 6d49b6294a8..8b1332bf013 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
@@ -14,101 +14,13 @@
  * limitations under the License.
  */
 
-#include "split/kernel_next_iteration_setup.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_next_iteration_setup.h"
 
-__kernel void kernel_ocl_path_trace_next_iteration_setup(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global char *sd,                  /* Required for setting up ray for next iteration */
-        ccl_global uint *rng_coop,            /* Required for setting up ray for next iteration */
-        ccl_global float3 *throughput_coop,   /* Required for setting up ray for next iteration */
-        PathRadiance *PathRadiance_coop,      /* Required for setting up ray for next iteration */
-        ccl_global Ray *Ray_coop,             /* Required for setting up ray for next iteration */
-        ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
-        ccl_global Ray *LightRay_dl_coop,     /* Required for radiance update - direct lighting */
-        ccl_global int *ISLamp_coop,          /* Required for radiance update - direct lighting */
-        ccl_global BsdfEval *BSDFEval_coop,   /* Required for radiance update - direct lighting */
-        ccl_global Ray *LightRay_ao_coop,     /* Required for radiance update - AO */
-        ccl_global float3 *AOBSDF_coop,       /* Required for radiance update - AO */
-        ccl_global float3 *AOAlpha_coop,      /* Required for radiance update - AO */
-        ccl_global char *ray_state,           /* Denotes the state of each ray */
-        ccl_global int *Queue_data,           /* Queue memory */
-        ccl_global int *Queue_index,          /* Tracks the number of elements in each queue */
-        int queuesize,                        /* Size (capacity) of each queue */
-        ccl_global char *use_queues_flag)     /* flag to decide if scene_intersect kernel should
-                                               * use queues to fetch ray index */
-{
-	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
+#define KERNEL_NAME next_iteration_setup
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
 
-	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
-		/* If we are here, then it means that scene-intersect kernel
-		* has already been executed atleast once. From the next time,
-		* scene-intersect kernel may operate on queues to fetch ray index
-		*/
-		use_queues_flag[0] = 1;
-
-		/* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and
-		 * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the
-		 * previous kernel.
-		 */
-		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
-		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
-	}
-
-	char enqueue_flag = 0;
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not
-	 * required.
-	 *
-	 * If we are executing on a CPU device, then we need to keep all threads
-	 * active since we have barrier() calls later in the kernel. CPU devices,
-	 * expect all threads to execute barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-		enqueue_flag = kernel_next_iteration_setup((KernelGlobals *)kg,
-		                                           (ShaderData *)sd,
-		                                           rng_coop,
-		                                           throughput_coop,
-		                                           PathRadiance_coop,
-		                                           Ray_coop,
-		                                           PathState_coop,
-		                                           LightRay_dl_coop,
-		                                           ISLamp_coop,
-		                                           BSDFEval_coop,
-		                                           LightRay_ao_coop,
-		                                           AOBSDF_coop,
-		                                           AOAlpha_coop,
-		                                           ray_state,
-		                                           use_queues_flag,
-		                                           ray_index);
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif
-
-	/* Enqueue RAY_UPDATE_BUFFER rays. */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics,
-	                        Queue_data,
-	                        Queue_index);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
new file mode 100644
index 00000000000..d908af78c7a
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
@@ -0,0 +1,341 @@
+/*
+ * Copyright 2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* For OpenCL we do manual lookup and interpolation. */
+
+ccl_device_inline ccl_global TextureInfo* kernel_tex_info(KernelGlobals *kg, uint id) {
+	const uint tex_offset = id  /* The texture id, plus one for every KERNEL_TEX() entry expanded below. */
+#define KERNEL_TEX(type, name) + 1
+#include "kernel/kernel_textures.h"
+	;  /* NOTE(review): KERNEL_TEX is not #undef'd here; presumably the header does it - confirm. */
+	/* Return the TextureInfo record for this texture; kg->buffers[0] is indexed as a TextureInfo array. */
+	return &((ccl_global TextureInfo*)kg->buffers[0])[tex_offset];
+}
+/* Read element `index` of `type` from buffer `info->cl_buffer` at offset `info->data`; expects `kg` in scope. */
+#define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[(info)->cl_buffer] + (info)->data))[(index)]
+
+ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset)
+{
+	/* Fetch the texel at linear index `offset` of texture `id` and widen it to float4. */
+	const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
+	const int data_type = kernel_tex_type(id);
+	const float byte_scale = 1.0f/255.0f;
+
+	/* Four float channels are returned as stored. */
+	if(data_type == IMAGE_DATA_TYPE_FLOAT4) {
+		return tex_fetch(float4, info, offset);
+	}
+	/* Four byte channels are each scaled by 1/255. */
+	if(data_type == IMAGE_DATA_TYPE_BYTE4) {
+		uchar4 rgba = tex_fetch(uchar4, info, offset);
+		return make_float4(rgba.x*byte_scale, rgba.y*byte_scale, rgba.z*byte_scale, rgba.w*byte_scale);
+	}
+	/* A single float channel fills x, y and z; w is 1. */
+	if(data_type == IMAGE_DATA_TYPE_FLOAT) {
+		float value = tex_fetch(float, info, offset);
+		return make_float4(value, value, value, 1.0f);
+	}
+
+	/* Every other type is read as a single byte channel. */
+	uchar gray_byte = tex_fetch(uchar, info, offset);
+	float gray = gray_byte * byte_scale;
+	return make_float4(gray, gray, gray, 1.0f);
+}
+
+ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
+{
+	/* Wrap x into [0, width) for a positive width. The remainder keeps the sign
+	 * of x, so a negative result is shifted up by one period. */
+	const int rem = x % width;
+	return (rem < 0)? rem + width: rem;
+}
+
+ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
+{
+	return clamp(x, 0, width-1);  /* Pin x to the index range [0, width-1]. */
+}
+
+ccl_device_inline float svm_image_texture_frac(float x, int *ix)
+{
+	/* Store float_to_int(x), lowered by one for negative x, in *ix and return x minus it. */
+	*ix = float_to_int(x) - ((x < 0.0f)? 1: 0);
+	return x - (float)(*ix);
+}
+/* Fill u[0..3] with the uniform cubic B-spline basis weights for offset t; the four weights sum to 1. */
+#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
+	{ \
+		u[0] = (((-1.0f/6.0f)* t + 0.5f) * t - 0.5f) * t + (1.0f/6.0f); \
+		u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
+		u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
+		u[3] = (1.0f / 6.0f) * t * t * t; \
+	} (void)0
+
+ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
+{
+	/* Sample 2D texture `id` at (x, y), where [0, 1] spans the image, using the
+	 * interpolation and extension modes stored in its TextureInfo. */
+	const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
+	uint width = info->width;
+	uint height = info->height;
+	uint interpolation = info->interpolation;
+	uint extension = info->extension;
+
+	if(interpolation == INTERPOLATION_CLOSEST) {
+		int ix, iy;
+		svm_image_texture_frac(x*width, &ix);
+		svm_image_texture_frac(y*height, &iy);
+
+		if(extension == EXTENSION_REPEAT) {
+			ix = svm_image_texture_wrap_periodic(ix, width);
+			iy = svm_image_texture_wrap_periodic(iy, height);
+		}
+		else {
+			if(extension == EXTENSION_CLIP) {
+				if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				}
+			}
+			/* Fall through. */
+			/* EXTENSION_EXTEND */
+			ix = svm_image_texture_wrap_clamp(ix, width);
+			iy = svm_image_texture_wrap_clamp(iy, height);
+		}
+
+		return svm_image_texture_read(kg, id, ix + iy*width);
+	}
+	else {
+		/* Bilinear or bicubic interpolation. */
+		int ix, iy, nix, niy;
+		float tx = svm_image_texture_frac(x*width - 0.5f, &ix);
+		float ty = svm_image_texture_frac(y*height - 0.5f, &iy);
+
+		if(extension == EXTENSION_REPEAT) {
+			ix = svm_image_texture_wrap_periodic(ix, width);
+			iy = svm_image_texture_wrap_periodic(iy, height);
+			nix = svm_image_texture_wrap_periodic(ix+1, width);
+			niy = svm_image_texture_wrap_periodic(iy+1, height);
+		}
+		else {
+			if(extension == EXTENSION_CLIP && (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f)) {
+				return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+			}
+			/* EXTENSION_EXTEND (and in-range CLIP): clamp the neighbours from the unclamped
+			 * index first, so an index of -1 yields the edge texel for both samples. */
+			nix = svm_image_texture_wrap_clamp(ix+1, width);
+			niy = svm_image_texture_wrap_clamp(iy+1, height);
+			ix = svm_image_texture_wrap_clamp(ix, width);
+			iy = svm_image_texture_wrap_clamp(iy, height);
+		}
+
+		if(interpolation == INTERPOLATION_LINEAR) {
+			/* Bilinear interpolation. */
+			float4 r;
+			r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, ix + iy*width);
+			r += (1.0f - ty)*tx*svm_image_texture_read(kg, id, nix + iy*width);
+			r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, ix + niy*width);
+			r += ty*tx*svm_image_texture_read(kg, id, nix + niy*width);
+			return r;
+		}
+
+		/* Bicubic interpolation. */
+		int pix, piy, nnix, nniy;
+		if(extension == EXTENSION_REPEAT) {
+			pix = svm_image_texture_wrap_periodic(ix-1, width);
+			piy = svm_image_texture_wrap_periodic(iy-1, height);
+			nnix = svm_image_texture_wrap_periodic(ix+2, width);
+			nniy = svm_image_texture_wrap_periodic(iy+2, height);
+		}
+		else {
+			pix = svm_image_texture_wrap_clamp(ix-1, width);
+			piy = svm_image_texture_wrap_clamp(iy-1, height);
+			nnix = svm_image_texture_wrap_clamp(ix+2, width);
+			nniy = svm_image_texture_wrap_clamp(iy+2, height);
+		}
+
+		const int xc[4] = {pix, ix, nix, nnix};
+		const int yc[4] = {width * piy,
+		                   width * iy,
+		                   width * niy,
+		                   width * nniy};
+		float u[4], v[4];
+		/* Helper macros to keep the code a reasonable size;
+		 * the compiler is left to inline all the multiplications.
+		 */
+#define DATA(x, y) (svm_image_texture_read(kg, id, xc[x] + yc[y]))
+#define TERM(col) \
+		(v[col] * (u[0] * DATA(0, col) + \
+		           u[1] * DATA(1, col) + \
+		           u[2] * DATA(2, col) + \
+		           u[3] * DATA(3, col)))
+
+		SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+		SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+
+		/* Actual interpolation. */
+		return TERM(0) + TERM(1) + TERM(2) + TERM(3);
+#undef TERM
+#undef DATA
+	}
+}
+
+
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z, int interp)
+{
+	/* Sample 3D texture `id` at (x, y, z). `interp` overrides the interpolation mode
+	 * stored in the TextureInfo unless it is INTERPOLATION_NONE. */
+	const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
+	uint width = info->width;
+	uint height = info->height;
+	uint depth = info->depth;
+	uint interpolation = (interp == INTERPOLATION_NONE)? info->interpolation: interp;
+	uint extension = info->extension;
+
+	if(interpolation == INTERPOLATION_CLOSEST) {
+		int ix, iy, iz;
+		svm_image_texture_frac(x*width, &ix);
+		svm_image_texture_frac(y*height, &iy);
+		svm_image_texture_frac(z*depth, &iz);
+
+		if(extension == EXTENSION_REPEAT) {
+			ix = svm_image_texture_wrap_periodic(ix, width);
+			iy = svm_image_texture_wrap_periodic(iy, height);
+			iz = svm_image_texture_wrap_periodic(iz, depth);
+		}
+		else {
+			if(extension == EXTENSION_CLIP) {
+				if(x < 0.0f || y < 0.0f || z < 0.0f ||
+				   x > 1.0f || y > 1.0f || z > 1.0f)
+				{
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				}
+			}
+			/* Fall through. */
+			/* EXTENSION_EXTEND */
+			ix = svm_image_texture_wrap_clamp(ix, width);
+			iy = svm_image_texture_wrap_clamp(iy, height);
+			iz = svm_image_texture_wrap_clamp(iz, depth);
+		}
+		return svm_image_texture_read(kg, id, ix + iy*width + iz*width*height);
+	}
+	else {
+		/* Trilinear or tricubic interpolation. */
+		int ix, iy, iz, nix, niy, niz;
+		float tx = svm_image_texture_frac(x*(float)width - 0.5f, &ix);
+		float ty = svm_image_texture_frac(y*(float)height - 0.5f, &iy);
+		float tz = svm_image_texture_frac(z*(float)depth - 0.5f, &iz);
+
+		if(extension == EXTENSION_REPEAT) {
+			ix = svm_image_texture_wrap_periodic(ix, width);
+			iy = svm_image_texture_wrap_periodic(iy, height);
+			iz = svm_image_texture_wrap_periodic(iz, depth);
+
+			nix = svm_image_texture_wrap_periodic(ix+1, width);
+			niy = svm_image_texture_wrap_periodic(iy+1, height);
+			niz = svm_image_texture_wrap_periodic(iz+1, depth);
+		}
+		else {
+			if(extension == EXTENSION_CLIP) {
+				if(x < 0.0f || y < 0.0f || z < 0.0f ||
+				   x > 1.0f || y > 1.0f || z > 1.0f)
+				{
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				}
+			}
+			/* Fall through. */
+			/* EXTENSION_EXTEND: the neighbours are clamped from the unclamped index, before ix/iy/iz. */
+			nix = svm_image_texture_wrap_clamp(ix+1, width);
+			niy = svm_image_texture_wrap_clamp(iy+1, height);
+			niz = svm_image_texture_wrap_clamp(iz+1, depth);
+
+			ix = svm_image_texture_wrap_clamp(ix, width);
+			iy = svm_image_texture_wrap_clamp(iy, height);
+			iz = svm_image_texture_wrap_clamp(iz, depth);
+		}
+
+		if(interpolation == INTERPOLATION_LINEAR) {
+			/* Trilinear interpolation. */
+			float4 r;
+			r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, ix + iy*width + iz*width*height);
+			r += (1.0f - tz)*(1.0f - ty)*tx*svm_image_texture_read(kg, id, nix + iy*width + iz*width*height);
+			r += (1.0f - tz)*ty*(1.0f - tx)*svm_image_texture_read(kg, id, ix + niy*width + iz*width*height);
+			r += (1.0f - tz)*ty*tx*svm_image_texture_read(kg, id, nix + niy*width + iz*width*height);
+
+			r += tz*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, ix + iy*width + niz*width*height);
+			r += tz*(1.0f - ty)*tx*svm_image_texture_read(kg, id, nix + iy*width + niz*width*height);
+			r += tz*ty*(1.0f - tx)*svm_image_texture_read(kg, id, ix + niy*width + niz*width*height);
+			r += tz*ty*tx*svm_image_texture_read(kg, id, nix + niy*width + niz*width*height);
+			return r;
+		}
+
+		/* Tricubic interpolation. */
+		int pix, piy, piz, nnix, nniy, nniz;
+		if(extension == EXTENSION_REPEAT) {
+			pix = svm_image_texture_wrap_periodic(ix-1, width);
+			piy = svm_image_texture_wrap_periodic(iy-1, height);
+			piz = svm_image_texture_wrap_periodic(iz-1, depth);
+			nnix = svm_image_texture_wrap_periodic(ix+2, width);
+			nniy = svm_image_texture_wrap_periodic(iy+2, height);
+			nniz = svm_image_texture_wrap_periodic(iz+2, depth);
+		}
+		else {
+			pix = svm_image_texture_wrap_clamp(ix-1, width);
+			piy = svm_image_texture_wrap_clamp(iy-1, height);
+			piz = svm_image_texture_wrap_clamp(iz-1, depth);
+			nnix = svm_image_texture_wrap_clamp(ix+2, width);
+			nniy = svm_image_texture_wrap_clamp(iy+2, height);
+			nniz = svm_image_texture_wrap_clamp(iz+2, depth);
+		}
+
+		const int xc[4] = {pix, ix, nix, nnix};
+		const int yc[4] = {width * piy,
+		                   width * iy,
+		                   width * niy,
+		                   width * nniy};
+		const int zc[4] = {width * height * piz,
+		                   width * height * iz,
+		                   width * height * niz,
+		                   width * height * nniz};
+		float u[4], v[4], w[4];
+
+		/* Helper macros to keep the code a reasonable size;
+		 * the compiler is left to inline all the multiplications.
+		 */
+#define DATA(x, y, z) (svm_image_texture_read(kg, id, xc[x] + yc[y] + zc[z]))
+#define COL_TERM(col, row) \
+		(v[col] * (u[0] * DATA(0, col, row) + \
+		           u[1] * DATA(1, col, row) + \
+		           u[2] * DATA(2, col, row) + \
+		           u[3] * DATA(3, col, row)))
+#define ROW_TERM(row) \
+		(w[row] * (COL_TERM(0, row) + \
+		           COL_TERM(1, row) + \
+		           COL_TERM(2, row) + \
+		           COL_TERM(3, row)))
+
+		SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+		SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+		SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+		/* Actual interpolation. */
+		return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+#undef COL_TERM
+#undef ROW_TERM
+#undef DATA
+	}
+}
+
+#undef SET_CUBIC_SPLINE_WEIGHTS
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
new file mode 100644
index 00000000000..fa210e747c0
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_path_init.h"
+
+#define KERNEL_NAME path_init
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
index 3156dc255fb..68ee6f1d536 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
@@ -14,93 +14,13 @@
  * limitations under the License.
  */
 
-#include "../../kernel_compat_opencl.h"
-#include "../../kernel_math.h"
-#include "../../kernel_types.h"
-#include "../../kernel_globals.h"
-#include "../../kernel_queues.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_queue_enqueue.h"
 
-/*
- * The kernel "kernel_queue_enqueue" enqueues rays of
- * different ray state into their appropriate Queues;
- * 1. Rays that have been determined to hit the background from the
- * "kernel_scene_intersect" kernel
- * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
- * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * The input and output of the kernel is as follows,
- *
- * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                           |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                           |
- * queuesize -------------------------------------------|                           |
- *
- * Note on Queues :
- * State of queues during the first time this kernel is called :
- * At entry,
- * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
- * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays.
- *
- * State of queue during other times this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
- */
-__kernel void kernel_ocl_path_trace_queue_enqueue(
-        ccl_global int *Queue_data,   /* Queue memory */
-        ccl_global int *Queue_index,  /* Tracks the number of elements in each queue */
-        ccl_global char *ray_state,   /* Denotes the state of each ray */
-        int queuesize)                /* Size (capacity) of each queue */
-{
-	/* We have only 2 cases (Hit/Not-Hit) */
-	ccl_local unsigned int local_queue_atomics[2];
-
-	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-
-	if(lidx < 2 ) {
-		local_queue_atomics[lidx] = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int queue_number = -1;
-
-	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-		queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
-	}
-	else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
-	}
-
-	unsigned int my_lqidx;
-	if(queue_number != -1) {
-		my_lqidx = get_local_queue_index(queue_number, local_queue_atomics);
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	if(lidx == 0) {
-		local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] =
-		        get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-		                                    local_queue_atomics,
-		                                    Queue_index);
-		local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] =
-		        get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-		                                    local_queue_atomics,
-		                                    Queue_index);
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
+#define KERNEL_NAME queue_enqueue
+#define LOCALS_TYPE QueueEnqueueLocals
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
 
-	unsigned int my_gqidx;
-	if(queue_number != -1) {
-		my_gqidx = get_global_queue_index(queue_number,
-		                                  queuesize,
-		                                  my_lqidx,
-		                                  local_queue_atomics);
-		Queue_data[my_gqidx] = ray_index;
-	}
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
index 7f3f433c7a6..10d09377ba9 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
@@ -14,67 +14,11 @@
  * limitations under the License.
  */
 
-#include "split/kernel_scene_intersect.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_scene_intersect.h"
 
-__kernel void kernel_ocl_path_trace_scene_intersect(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global uint *rng_coop,
-        ccl_global Ray *Ray_coop,              /* Required for scene_intersect */
-        ccl_global PathState *PathState_coop,  /* Required for scene_intersect */
-        Intersection *Intersection_coop,       /* Required for scene_intersect */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int sw, int sh,
-        ccl_global int *Queue_data,            /* Memory for queues */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in queues */
-        int queuesize,                         /* Size (capacity) of queues */
-        ccl_global char *use_queues_flag,      /* used to decide if this kernel should use
-                                                * queues to fetch ray index */
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples)                  /* Number of samples to be processed in parallel */
-{
-	int x = get_global_id(0);
-	int y = get_global_id(1);
+#define KERNEL_NAME scene_intersect
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
 
-	/* Fetch use_queues_flag */
-	ccl_local char local_use_queues_flag;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_use_queues_flag = use_queues_flag[0];
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int ray_index;
-	if(local_use_queues_flag) {
-		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-		ray_index = get_ray_index(thread_index,
-		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-		                          Queue_data,
-		                          queuesize,
-		                          0);
-
-		if(ray_index == QUEUE_EMPTY_SLOT) {
-			return;
-		}
-	} else {
-		if(x < (sw * parallel_samples) && y < sh) {
-			ray_index = x + y * (sw * parallel_samples);
-		} else {
-			return;
-		}
-	}
-
-	kernel_scene_intersect((KernelGlobals *)kg,
-	                       rng_coop,
-	                       Ray_coop,
-	                       PathState_coop,
-	                       Intersection_coop,
-	                       ray_state,
-	                       sw, sh,
-	                       use_queues_flag,
-#ifdef __KERNEL_DEBUG__
-	                       debugdata_coop,
-#endif
-	                       ray_index);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
index c37856c8f30..40eaa561863 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
@@ -14,55 +14,11 @@
  * limitations under the License.
  */
 
-#include "split/kernel_shader_eval.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shader_eval.h"
 
-__kernel void kernel_ocl_path_trace_shader_eval(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global char *sd,                   /* Output ShaderData structure to be filled */
-        ccl_global uint *rng_coop,             /* Required for rbsdf calculation */
-        ccl_global Ray *Ray_coop,              /* Required for setting up shader from ray */
-        ccl_global PathState *PathState_coop,  /* Required for all functions in this kernel */
-        Intersection *Intersection_coop,       /* Required for setting up shader from ray */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        ccl_global int *Queue_data,            /* queue memory */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
-        int queuesize)                         /* Size (capacity) of each queue */
-{
-	/* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
-	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
+#define KERNEL_NAME shader_eval
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
 
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          0);
-
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-
-	char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics,
-	                        Queue_data,
-	                        Queue_index);
-
-	/* Continue on with shader evaluation. */
-	kernel_shader_eval((KernelGlobals *)kg,
-	                   (ShaderData *)sd,
-	                   rng_coop,
-	                   Ray_coop,
-	                   PathState_coop,
-	                   Intersection_coop,
-	                   ray_state,
-	                   ray_index);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
new file mode 100644
index 00000000000..8c36100f762
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shader_setup.h"
+/* Instantiate kernel_ocl_path_trace_shader_setup(); LOCALS_TYPE makes the template declare a ccl_local unsigned int and pass its address to kernel_shader_setup(). */
+#define KERNEL_NAME shader_setup
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
new file mode 100644
index 00000000000..bcacaa4a054
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shader_sort.h"
+/* Instantiate kernel_ocl_path_trace_shader_sort() with a ccl_local ShaderSortLocals; the attribute line precedes the directives so that it prefixes the __kernel definition pulled in by the include. */
+__attribute__((reqd_work_group_size(64, 1, 1)))
+#define KERNEL_NAME shader_sort
+#define LOCALS_TYPE ShaderSortLocals
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
deleted file mode 100644
index edf76fba714..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "split/kernel_shadow_blocked.h"
-
-__kernel void kernel_ocl_path_trace_shadow_blocked(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global PathState *PathState_coop,  /* Required for shadow blocked */
-        ccl_global Ray *LightRay_dl_coop,      /* Required for direct lighting's shadow blocked */
-        ccl_global Ray *LightRay_ao_coop,      /* Required for AO's shadow blocked */
-        ccl_global char *ray_state,
-        ccl_global int *Queue_data,            /* Queue memory */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
-        int queuesize)                         /* Size (capacity) of each queue */
-{
-	int lidx = get_local_id(1) * get_local_id(0) + get_local_id(0);
-
-	ccl_local unsigned int ao_queue_length;
-	ccl_local unsigned int dl_queue_length;
-	if(lidx == 0) {
-		ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
-		dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	/* flag determining if the current ray is to process shadow ray for AO or DL */
-	char shadow_blocked_type = -1;
-
-	int ray_index = QUEUE_EMPTY_SLOT;
-	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	if(thread_index < ao_queue_length + dl_queue_length) {
-		if(thread_index < ao_queue_length) {
-			ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1);
-			shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO;
-		} else {
-			ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1);
-			shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL;
-		}
-	}
-
-	if(ray_index == QUEUE_EMPTY_SLOT)
-		return;
-
-	kernel_shadow_blocked((KernelGlobals *)kg,
-	                      PathState_coop,
-	                      LightRay_dl_coop,
-	                      LightRay_ao_coop,
-	                      ray_state,
-	                      shadow_blocked_type,
-	                      ray_index);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
new file mode 100644
index 00000000000..8de250a375c
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shadow_blocked_ao.h"
+/* Instantiate kernel_ocl_path_trace_shadow_blocked_ao(); LOCALS_TYPE is not defined here, so the template calls kernel_shadow_blocked_ao(kg) with kg only. */
+#define KERNEL_NAME shadow_blocked_ao
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
new file mode 100644
index 00000000000..29da77022ed
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shadow_blocked_dl.h"
+/* Instantiate kernel_ocl_path_trace_shadow_blocked_dl(); LOCALS_TYPE is not defined here, so the template calls kernel_shadow_blocked_dl(kg) with kg only. */
+#define KERNEL_NAME shadow_blocked_dl
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
new file mode 100644
index 00000000000..4cbda1bc2e7
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"  // PRECOMPILED
+#include "kernel/split/kernel_split_common.h"  // PRECOMPILED
+
+#include "kernel/kernels/opencl/kernel_state_buffer_size.cl"
+#include "kernel/kernels/opencl/kernel_data_init.cl"
+#include "kernel/kernels/opencl/kernel_path_init.cl"
+
+#include "kernel/kernels/opencl/kernel_scene_intersect.cl"
+#include "kernel/kernels/opencl/kernel_lamp_emission.cl"
+#include "kernel/kernels/opencl/kernel_do_volume.cl"
+#include "kernel/kernels/opencl/kernel_indirect_background.cl"
+#include "kernel/kernels/opencl/kernel_queue_enqueue.cl"
+#include "kernel/kernels/opencl/kernel_shader_setup.cl"
+#include "kernel/kernels/opencl/kernel_shader_sort.cl"
+#include "kernel/kernels/opencl/kernel_shader_eval.cl"
+#include "kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl"
+#include "kernel/kernels/opencl/kernel_subsurface_scatter.cl"
+#include "kernel/kernels/opencl/kernel_direct_lighting.cl"
+#include "kernel/kernels/opencl/kernel_shadow_blocked_ao.cl"
+#include "kernel/kernels/opencl/kernel_shadow_blocked_dl.cl"
+#include "kernel/kernels/opencl/kernel_enqueue_inactive.cl"
+#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl"
+#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl"
+#include "kernel/kernels/opencl/kernel_buffer_update.cl"
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
new file mode 100644
index 00000000000..6aa7681cbed
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* Template header: the includer defines KERNEL_NAME (and optionally LOCALS_TYPE) before including it. The two helper macros below paste their arguments with '_' after expanding them, and are #undef'd at the end. */
+#define KERNEL_NAME_JOIN(a, b) a ## _ ## b
+#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b)
+/* Entry point kernel_ocl_path_trace_<KERNEL_NAME>: fills in kg for this launch, then calls kernel_<KERNEL_NAME>(kg[, &locals]). */
+__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)(
+		ccl_global char *kg_global,
+		ccl_constant KernelData *data,
+
+		ccl_global void *split_data_buffer,
+		ccl_global char *ray_state,
+
+		KERNEL_BUFFER_PARAMS,
+
+		ccl_global int *queue_index,
+		ccl_global char *use_queues_flag,
+		ccl_global unsigned int *work_pools,
+		ccl_global float *buffer
+	)
+{
+#ifdef LOCALS_TYPE
+	ccl_local LOCALS_TYPE locals;
+#endif
+
+	KernelGlobals *kg = (KernelGlobals*)kg_global;
+	/* Only work-items whose local ids in dimensions 0 and 1 sum to zero run the setup below. */
+	if(ccl_local_id(0) + ccl_local_id(1) == 0) {
+		kg->data = data;
+
+		kernel_split_params.queue_index = queue_index;
+		kernel_split_params.use_queues_flag = use_queues_flag;
+		kernel_split_params.work_pools = work_pools;
+		kernel_split_params.tile.buffer = buffer;
+
+		split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state);
+
+	}
+	/* NOTE(review): no barrier is issued between the setup above and the calls below, which every work-item executes — confirm that is intended. */
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+
+	KERNEL_NAME_EVAL(kernel, KERNEL_NAME)(
+			kg
+#ifdef LOCALS_TYPE
+			, &locals
+#endif
+		);
+}
+
+#undef KERNEL_NAME_JOIN
+#undef KERNEL_NAME_EVAL
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl
new file mode 100644
index 00000000000..c10ecc426c6
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+/* Stores `data` in the KernelGlobals at `kg` and writes split_data_buffer_size(kg, num_threads) to *size. */
+__kernel void kernel_ocl_path_trace_state_buffer_size(
+        ccl_global char *kg,
+        ccl_constant KernelData *data,
+        uint num_threads,
+        ccl_global uint64_t *size)
+{
+	((KernelGlobals*)kg)->data = data;
+	*size = split_data_buffer_size((KernelGlobals*)kg, num_threads);
+}
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
new file mode 100644
index 00000000000..2b3be38df84
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_subsurface_scatter.h"
+/* Instantiate kernel_ocl_path_trace_subsurface_scatter(); LOCALS_TYPE is not defined here, so the template calls kernel_subsurface_scatter(kg) with kg only. */
+#define KERNEL_NAME subsurface_scatter
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
deleted file mode 100644
index 88a1ed830af..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "split/kernel_sum_all_radiance.h"
-
-__kernel void kernel_ocl_path_trace_sum_all_radiance(
-        ccl_constant KernelData *data,               /* To get pass_stride to offet into buffer */
-        ccl_global float *buffer,                    /* Output buffer of RenderTile */
-        ccl_global float *per_sample_output_buffer,  /* Radiance contributed by all samples */
-        int parallel_samples, int sw, int sh, int stride,
-        int buffer_offset_x,
-        int buffer_offset_y,
-        int buffer_stride,
-        int start_sample)
-{
-	kernel_sum_all_radiance(data,
-	                        buffer,
-	                        per_sample_output_buffer,
-	                        parallel_samples,
-	                        sw, sh, stride,
-	                        buffer_offset_x,
-	                        buffer_offset_y,
-	                        buffer_stride,
-	                        start_sample);
-}
diff --git a/intern/cycles/kernel/openvdb/vdb_intern.h b/intern/cycles/kernel/openvdb/vdb_intern.h
index 71d6b81e0ff..0ebb0eed094 100644
--- a/intern/cycles/kernel/openvdb/vdb_intern.h
+++ b/intern/cycles/kernel/openvdb/vdb_intern.h
@@ -33,7 +33,7 @@
 #	pragma GCC diagnostic pop
 #endif
 
-#include "util_vector.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/CMakeLists.txt b/intern/cycles/kernel/osl/CMakeLists.txt
index 98de40e5a8a..d2eb89e0e0a 100644
--- a/intern/cycles/kernel/osl/CMakeLists.txt
+++ b/intern/cycles/kernel/osl/CMakeLists.txt
@@ -1,12 +1,6 @@
 
 set(INC
-	.
-	..
-	../svm
-	../../graph
-	../../render
-	../../util
-	../../device
+	../..
 )
 
 set(INC_SYS
diff --git a/intern/cycles/kernel/osl/background.cpp b/intern/cycles/kernel/osl/background.cpp
index d835f9be45c..2e73e7a601e 100644
--- a/intern/cycles/kernel/osl/background.cpp
+++ b/intern/cycles/kernel/osl/background.cpp
@@ -34,10 +34,10 @@
 
 #include <OSL/genclosure.h>
 
-#include "osl_closures.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_compat_cpu.h"
-#include "closure/alloc.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/closure/alloc.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
index bc26f42b559..ea18f2c8c86 100644
--- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
@@ -34,13 +34,13 @@
 
 #include <OSL/genclosure.h>
 
-#include "kernel_compat_cpu.h"
-#include "osl_closures.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_types.h"
-#include "kernel_montecarlo.h"
-#include "closure/alloc.h"
-#include "closure/bsdf_diffuse_ramp.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_diffuse_ramp.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
index 14c7644936e..a26671eb09e 100644
--- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
@@ -34,12 +34,12 @@
 
 #include <OSL/genclosure.h>
 
-#include "kernel_compat_cpu.h"
-#include "osl_closures.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_types.h"
-#include "closure/alloc.h"
-#include "closure/bsdf_phong_ramp.h"
+#include "kernel/kernel_types.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_phong_ramp.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/emissive.cpp b/intern/cycles/kernel/osl/emissive.cpp
index 3f13e08b302..8843a196dad 100644
--- a/intern/cycles/kernel/osl/emissive.cpp
+++ b/intern/cycles/kernel/osl/emissive.cpp
@@ -34,12 +34,12 @@
 
 #include <OSL/genclosure.h>
 
-#include "osl_closures.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_compat_cpu.h"
-#include "kernel_types.h"
-#include "closure/alloc.h"
-#include "closure/emissive.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/kernel_types.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/emissive.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index 3614717e28c..27a96720c1e 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -32,15 +32,17 @@
 
 #include <OSL/genclosure.h>
 
-#include "kernel_compat_cpu.h"
-#include "osl_closures.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_types.h"
-#include "kernel_montecarlo.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_montecarlo.h"
 
-#include "closure/alloc.h"
-#include "closure/bsdf_diffuse.h"
-#include "closure/bssrdf.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_util.h"
+#include "kernel/closure/bsdf_diffuse.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bssrdf.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -78,7 +80,8 @@ public:
 				bssrdf->albedo = albedo.x;
 				bssrdf->sharpness = sharpness;
 				bssrdf->N = params.N;
-				ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+				bssrdf->roughness = params.roughness;
+				sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
 			}
 
 			bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f));
@@ -89,7 +92,8 @@ public:
 				bssrdf->albedo = albedo.y;
 				bssrdf->sharpness = sharpness;
 				bssrdf->N = params.N;
-				ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+				bssrdf->roughness = params.roughness;
+				sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
 			}
 
 			bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z));
@@ -100,7 +104,8 @@ public:
 				bssrdf->albedo = albedo.z;
 				bssrdf->sharpness = sharpness;
 				bssrdf->N = params.N;
-				ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+				bssrdf->roughness = params.roughness;
+				sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
 			}
 		}
 	}
@@ -180,5 +185,31 @@ ClosureParam *closure_bssrdf_burley_params()
 
 CCLOSURE_PREPARE(closure_bssrdf_burley_prepare, BurleyBSSRDFClosure)
 
+/* Disney principled: setup() delegates to alloc() with CLOSURE_BSSRDF_PRINCIPLED_ID as the closure type. */
+
+class PrincipledBSSRDFClosure : public CBSSRDFClosure {
+public:
+	void setup(ShaderData *sd, int path_flag, float3 weight)
+	{
+		alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_ID);
+	}
+};
+/* Returns the static ClosureParam table for PrincipledBSSRDFClosure: N, radius, texture_blur, albedo, roughness, plus the "label" string key. */
+ClosureParam *closure_bssrdf_principled_params()
+{
+	static ClosureParam params[] = {
+		CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, params.N),
+		CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, radius),
+		CLOSURE_FLOAT_PARAM(PrincipledBSSRDFClosure, params.texture_blur),
+		CLOSURE_FLOAT3_PARAM(PrincipledBSSRDFClosure, albedo),
+		CLOSURE_FLOAT_PARAM(PrincipledBSSRDFClosure, params.roughness),
+		CLOSURE_STRING_KEYPARAM(PrincipledBSSRDFClosure, label, "label"),
+		CLOSURE_FINISH_PARAM(PrincipledBSSRDFClosure)
+	};
+	return params;
+}
+
+CCLOSURE_PREPARE(closure_bssrdf_principled_prepare, PrincipledBSSRDFClosure)
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index 94de782dca0..14c5c1c3db5 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -33,33 +33,36 @@
 #include <OSL/genclosure.h>
 #include <OSL/oslclosure.h>
 
-#include "osl_closures.h"
-#include "osl_shader.h"
-
-#include "util_debug.h"
-#include "util_math.h"
-#include "util_param.h"
-
-#include "kernel_types.h"
-#include "kernel_compat_cpu.h"
-#include "kernel_globals.h"
-#include "kernel_montecarlo.h"
-#include "kernel_random.h"
-
-#include "closure/alloc.h"
-#include "closure/bsdf_util.h"
-#include "closure/bsdf_ashikhmin_velvet.h"
-#include "closure/bsdf_diffuse.h"
-#include "closure/bsdf_microfacet.h"
-#include "closure/bsdf_microfacet_multi.h"
-#include "closure/bsdf_oren_nayar.h"
-#include "closure/bsdf_reflection.h"
-#include "closure/bsdf_refraction.h"
-#include "closure/bsdf_transparent.h"
-#include "closure/bsdf_ashikhmin_shirley.h"
-#include "closure/bsdf_toon.h"
-#include "closure/bsdf_hair.h"
-#include "closure/volume.h"
+#include "kernel/osl/osl_closures.h"
+#include "kernel/osl/osl_shader.h"
+
+#include "util/util_debug.h"
+#include "util/util_math.h"
+#include "util/util_param.h"
+
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_random.h"
+
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_util.h"
+#include "kernel/closure/bsdf_ashikhmin_velvet.h"
+#include "kernel/closure/bsdf_diffuse.h"
+#include "kernel/closure/bsdf_microfacet.h"
+#include "kernel/closure/bsdf_microfacet_multi.h"
+#include "kernel/closure/bsdf_oren_nayar.h"
+#include "kernel/closure/bsdf_reflection.h"
+#include "kernel/closure/bsdf_refraction.h"
+#include "kernel/closure/bsdf_transparent.h"
+#include "kernel/closure/bsdf_ashikhmin_shirley.h"
+#include "kernel/closure/bsdf_toon.h"
+#include "kernel/closure/bsdf_hair.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bsdf_principled_sheen.h"
+#include "kernel/closure/volume.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -153,7 +156,7 @@ BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refra
 BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction)
 
 BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY)
-	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, unused),
+	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.N),
 	CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness1),
 	CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness2),
 	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T),
@@ -161,7 +164,7 @@ BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY
 BSDF_CLOSURE_CLASS_END(HairReflection, hair_reflection)
 
 BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, HairBsdf, LABEL_GLOSSY)
-	CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, unused),
+	CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, params.N),
 	CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness1),
 	CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness2),
 	CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T),
@@ -175,6 +178,63 @@ VOLUME_CLOSURE_CLASS_END(VolumeHenyeyGreenstein, henyey_greenstein)
 VOLUME_CLOSURE_CLASS_BEGIN(VolumeAbsorption, absorption, ShaderClosure, LABEL_SINGULAR)
 VOLUME_CLOSURE_CLASS_END(VolumeAbsorption, absorption)
 
+BSDF_CLOSURE_CLASS_BEGIN(PrincipledDiffuse, principled_diffuse, PrincipledDiffuseBsdf, LABEL_DIFFUSE)
+	CLOSURE_FLOAT3_PARAM(PrincipledDiffuseClosure, params.N),
+	CLOSURE_FLOAT_PARAM(PrincipledDiffuseClosure, params.roughness),
+BSDF_CLOSURE_CLASS_END(PrincipledDiffuse, principled_diffuse)
+/* NOTE(review): these BEGIN/END pairs presumably generate the <Name>Closure classes and the bsdf_<name>_params()/bsdf_<name>_prepare() functions used at registration — the macro bodies are defined elsewhere, confirm there. */
+BSDF_CLOSURE_CLASS_BEGIN(PrincipledSheen, principled_sheen, PrincipledSheenBsdf, LABEL_DIFFUSE)
+	CLOSURE_FLOAT3_PARAM(PrincipledSheenClosure, params.N),
+BSDF_CLOSURE_CLASS_END(PrincipledSheen, principled_sheen)
+
+/* DISNEY PRINCIPLED CLEARCOAT: microfacet closure whose ior and cspec0 are hard-coded in alloc(); roughness and clearcoat weight come from the members below. */
+class PrincipledClearcoatClosure : public CBSDFClosure {
+public:
+	MicrofacetBsdf params;
+	float clearcoat, clearcoat_roughness;
+	/* Allocates the BSDF and its MicrofacetExtra; returns NULL unless both succeed. path_flag is not read. NOTE(review): closure_alloc_extra() runs even when bsdf_alloc_osl() returned NULL — confirm this is acceptable. */
+	MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)
+	{
+		MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params);
+		MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
+		if(bsdf && extra) {
+			bsdf->extra = extra;
+
+			bsdf->ior = 1.5f;
+
+			bsdf->alpha_x = clearcoat_roughness;
+			bsdf->alpha_y = clearcoat_roughness;
+			/* 0.04 == ((1.5 - 1) / (1.5 + 1))^2; presumably the normal-incidence reflectance matching the ior above — confirm. */
+			bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f);
+			bsdf->extra->clearcoat = clearcoat;
+
+			return bsdf;
+		}
+
+		return NULL;
+	}
+	/* ORs the flags returned by bsdf_microfacet_ggx_clearcoat_setup() into sd->flag when alloc() returned a closure. */
+	void setup(ShaderData *sd, int path_flag, float3 weight)
+	{
+		MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+		sd->flag |= (bsdf) ? bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd) : 0;
+	}
+};
+/* Returns the static ClosureParam table for PrincipledClearcoatClosure: N, clearcoat, clearcoat_roughness, plus the "label" string key. */
+ClosureParam *closure_bsdf_principled_clearcoat_params()
+{
+	static ClosureParam params[] = {
+		CLOSURE_FLOAT3_PARAM(PrincipledClearcoatClosure, params.N),
+		CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat),
+		CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat_roughness),
+		CLOSURE_STRING_KEYPARAM(PrincipledClearcoatClosure, label, "label"),
+		CLOSURE_FINISH_PARAM(PrincipledClearcoatClosure)
+	};
+	return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_principled_clearcoat_prepare, PrincipledClearcoatClosure)
+
+
 /* Registration */
 
 static void register_closure(OSL::ShadingSystem *ss, const char *name, int id, OSL::ClosureParam *params, OSL::PrepareClosureFunc prepare)
@@ -214,6 +274,16 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
 		closure_bsdf_microfacet_multi_ggx_glass_params(), closure_bsdf_microfacet_multi_ggx_glass_prepare);
 	register_closure(ss, "microfacet_multi_ggx_aniso", id++,
 		closure_bsdf_microfacet_multi_ggx_aniso_params(), closure_bsdf_microfacet_multi_ggx_aniso_prepare);
+	register_closure(ss, "microfacet_ggx_fresnel", id++,
+		closure_bsdf_microfacet_ggx_fresnel_params(), closure_bsdf_microfacet_ggx_fresnel_prepare);
+	register_closure(ss, "microfacet_ggx_aniso_fresnel", id++,
+		closure_bsdf_microfacet_ggx_aniso_fresnel_params(), closure_bsdf_microfacet_ggx_aniso_fresnel_prepare);
+	register_closure(ss, "microfacet_multi_ggx_fresnel", id++,
+		closure_bsdf_microfacet_multi_ggx_fresnel_params(), closure_bsdf_microfacet_multi_ggx_fresnel_prepare);
+	register_closure(ss, "microfacet_multi_ggx_glass_fresnel", id++,
+		closure_bsdf_microfacet_multi_ggx_glass_fresnel_params(), closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare);
+	register_closure(ss, "microfacet_multi_ggx_aniso_fresnel", id++,
+		closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params(), closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare);
 	register_closure(ss, "microfacet_beckmann", id++,
 		bsdf_microfacet_beckmann_params(), bsdf_microfacet_beckmann_prepare);
 	register_closure(ss, "microfacet_beckmann_aniso", id++,
@@ -228,6 +298,12 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
 		bsdf_diffuse_toon_params(), bsdf_diffuse_toon_prepare);
 	register_closure(ss, "glossy_toon", id++,
 		bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare);
+	register_closure(ss, "principled_diffuse", id++,
+		bsdf_principled_diffuse_params(), bsdf_principled_diffuse_prepare);
+	register_closure(ss, "principled_sheen", id++,
+		bsdf_principled_sheen_params(), bsdf_principled_sheen_prepare);
+	register_closure(ss, "principled_clearcoat", id++,
+		closure_bsdf_principled_clearcoat_params(), closure_bsdf_principled_clearcoat_prepare);
 
 	register_closure(ss, "emission", id++,
 		closure_emission_params(), closure_emission_prepare);
@@ -247,6 +323,8 @@ void OSLShader::register_closures(OSLShadingSystem *ss_)
 		closure_bssrdf_gaussian_params(), closure_bssrdf_gaussian_prepare);
 	register_closure(ss, "bssrdf_burley", id++,
 		closure_bssrdf_burley_params(), closure_bssrdf_burley_prepare);
+	register_closure(ss, "bssrdf_principled", id++,
+		closure_bssrdf_principled_params(), closure_bssrdf_principled_prepare);
 
 	register_closure(ss, "hair_reflection", id++,
 		bsdf_hair_reflection_params(), bsdf_hair_reflection_prepare);
@@ -277,6 +355,86 @@ bool CBSDFClosure::skip(const ShaderData *sd, int path_flag, int scattering)
 	return false;
 }
 
+
+/* GGX closures with Fresnel */
+
+class MicrofacetFresnelClosure : public CBSDFClosure {
+public:
+	MicrofacetBsdf params;
+	float3 color;
+	float3 cspec0;
+
+	MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)
+	{
+		/* skip() is queried with the glossy reflection labels only; these are
+		 * set statically and only used for caustic flags. Returns NULL when
+		 * the closure is skipped or when either allocation fails. */
+		if(skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT))
+			return NULL;
+		MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params);
+		/* Request the extra block only once the BSDF exists; without it the block is never used. */
+		MicrofacetExtra *extra = bsdf ? (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)) : NULL;
+		if(extra == NULL)
+			return NULL;
+
+		extra->color = color;
+		extra->cspec0 = cspec0;
+		bsdf->extra = extra;
+		return bsdf;
+	}
+};
+
+class MicrofacetGGXFresnelClosure : public MicrofacetFresnelClosure {
+public:
+	void setup(ShaderData *sd, int path_flag, float3 weight) /* allocate via alloc(), then run the GGX Fresnel setup */
+	{
+		MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+		sd->flag |= (bsdf) ? bsdf_microfacet_ggx_fresnel_setup(bsdf, sd) : 0; /* NULL bsdf: sd->flag is left unchanged */
+	}
+};
+
+ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params() /* returns the static parameter table for MicrofacetGGXFresnelClosure */
+{
+	static ClosureParam params[] = { /* entries mirror microfacet_ggx_fresnel(N, ag, eta, C, Cspec0) in stdosl.h */
+		CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.N),
+		CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_x),
+		CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.ior),
+		CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, color),
+		CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, cspec0),
+		CLOSURE_STRING_KEYPARAM(MicrofacetGGXFresnelClosure, label, "label"),
+		CLOSURE_FINISH_PARAM(MicrofacetGGXFresnelClosure)
+	};
+	return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_fresnel_prepare, MicrofacetGGXFresnelClosure);
+
+class MicrofacetGGXAnisoFresnelClosure : public MicrofacetFresnelClosure {
+public:
+	void setup(ShaderData *sd, int path_flag, float3 weight) /* allocate via alloc(), then run the anisotropic GGX Fresnel setup */
+	{
+		MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+		sd->flag |= (bsdf) ? bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd) : 0; /* NULL bsdf: sd->flag is left unchanged */
+	}
+};
+
+ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params() /* returns the static parameter table for MicrofacetGGXAnisoFresnelClosure */
+{
+	static ClosureParam params[] = { /* entries mirror microfacet_ggx_aniso_fresnel(N, T, ax, ay, eta, C, Cspec0) in stdosl.h */
+		CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoFresnelClosure, params.N),
+		CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoFresnelClosure, params.T),
+		CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoFresnelClosure, params.alpha_x),
+		CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoFresnelClosure, params.alpha_y),
+		CLOSURE_FLOAT_PARAM(MicrofacetGGXAnisoFresnelClosure, params.ior),
+		CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoFresnelClosure, color),
+		CLOSURE_FLOAT3_PARAM(MicrofacetGGXAnisoFresnelClosure, cspec0),
+		CLOSURE_STRING_KEYPARAM(MicrofacetGGXAnisoFresnelClosure, label, "label"),
+		CLOSURE_FINISH_PARAM(MicrofacetGGXAnisoFresnelClosure) /* same class that CCLOSURE_PREPARE below instantiates */
+	};
+	return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_aniso_fresnel_prepare, MicrofacetGGXAnisoFresnelClosure);
+
+
 /* Multiscattering GGX closures */
 
 class MicrofacetMultiClosure : public CBSDFClosure {
@@ -286,7 +444,7 @@ public:
 
 	MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)
 	{
-		/* Technically, the MultiGGX Glass closure may also transmit. However,
+		/* Technically, the MultiGGX closure may also transmit. However,
 		 * since this is set statically and only used for caustic flags, this
 		 * is probably as good as it gets. */
 	    if(!skip(sd, path_flag, LABEL_GLOSSY|LABEL_REFLECT)) {
@@ -374,5 +532,110 @@ ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params()
 }
 CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_prepare, MicrofacetMultiGGXGlassClosure);
 
+
+/* Multiscattering GGX closures with Fresnel */
+
+class MicrofacetMultiFresnelClosure : public CBSDFClosure {
+public:
+	MicrofacetBsdf params;
+	float3 color;
+	float3 cspec0;
+
+	MicrofacetBsdf *alloc(ShaderData *sd, int path_flag, float3 weight)
+	{
+		/* Technically, the MultiGGX closure may also transmit. However, since
+		 * this is set statically and only used for caustic flags, this is
+		 * probably as good as it gets. Returns NULL if skipped or if an allocation fails. */
+		if(skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT))
+			return NULL;
+		MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc_osl(sd, sizeof(MicrofacetBsdf), weight, &params);
+		/* Request the extra block only once the BSDF exists; without it the block is never used. */
+		MicrofacetExtra *extra = bsdf ? (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)) : NULL;
+		if(extra == NULL)
+			return NULL;
+
+		extra->color = color;
+		extra->cspec0 = cspec0;
+		bsdf->extra = extra;
+		return bsdf;
+	}
+};
+
+class MicrofacetMultiGGXFresnelClosure : public MicrofacetMultiFresnelClosure {
+public:
+	void setup(ShaderData *sd, int path_flag, float3 weight) /* allocate via alloc(), then run the multi-GGX Fresnel setup */
+	{
+		MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+		sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_fresnel_setup(bsdf, sd) : 0; /* NULL bsdf: sd->flag is left unchanged */
+	}
+};
+
+ClosureParam *closure_bsdf_microfacet_multi_ggx_fresnel_params() /* parameter table registered for "microfacet_multi_ggx_fresnel" */
+{
+	static ClosureParam params[] = { /* entries mirror microfacet_multi_ggx_fresnel(N, ag, eta, C, Cspec0) in stdosl.h */
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N),
+		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x),
+		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior),
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color),
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0),
+		CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"),
+		CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure)
+	};
+	return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_fresnel_prepare, MicrofacetMultiGGXFresnelClosure);
+
+class MicrofacetMultiGGXAnisoFresnelClosure : public MicrofacetMultiFresnelClosure {
+public:
+	void setup(ShaderData *sd, int path_flag, float3 weight) /* allocate via alloc(), then run the anisotropic multi-GGX Fresnel setup */
+	{
+		MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+		sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd) : 0; /* NULL bsdf: sd->flag is left unchanged */
+	}
+};
+
+ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params() /* parameter table registered for "microfacet_multi_ggx_aniso_fresnel" */
+{
+	static ClosureParam params[] = { /* entries mirror microfacet_multi_ggx_aniso_fresnel(N, T, ax, ay, eta, C, Cspec0) in stdosl.h */
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXAnisoFresnelClosure, params.N),
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXAnisoFresnelClosure, params.T),
+		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXAnisoFresnelClosure, params.alpha_x),
+		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXAnisoFresnelClosure, params.alpha_y),
+		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXAnisoFresnelClosure, params.ior),
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXAnisoFresnelClosure, color),
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXAnisoFresnelClosure, cspec0),
+		CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXAnisoFresnelClosure, label, "label"),
+		CLOSURE_FINISH_PARAM(MicrofacetMultiGGXAnisoFresnelClosure) /* same class that CCLOSURE_PREPARE below instantiates */
+	};
+	return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare, MicrofacetMultiGGXAnisoFresnelClosure);
+
+class MicrofacetMultiGGXGlassFresnelClosure : public MicrofacetMultiFresnelClosure {
+public:
+	MicrofacetMultiGGXGlassFresnelClosure() : MicrofacetMultiFresnelClosure() {}
+
+	void setup(ShaderData *sd, int path_flag, float3 weight) /* allocate via alloc(), then run the multi-GGX glass Fresnel setup */
+	{
+		MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
+		sd->flag |= (bsdf) ? bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd) : 0; /* NULL bsdf: sd->flag is left unchanged */
+	}
+};
+
+ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params() /* parameter table registered for "microfacet_multi_ggx_glass_fresnel" */
+{
+	static ClosureParam params[] = { /* entries mirror microfacet_multi_ggx_glass_fresnel(N, ag, eta, C, Cspec0) in stdosl.h */
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXGlassFresnelClosure, params.N),
+		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXGlassFresnelClosure, params.alpha_x),
+		CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXGlassFresnelClosure, params.ior),
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXGlassFresnelClosure, color),
+		CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXGlassFresnelClosure, cspec0),
+		CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXGlassFresnelClosure, label, "label"),
+		CLOSURE_FINISH_PARAM(MicrofacetMultiGGXGlassFresnelClosure) /* same class that CCLOSURE_PREPARE below instantiates */
+	};
+	return params;
+}
+CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare, MicrofacetMultiGGXGlassFresnelClosure);
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h
index cd7b33703ff..ff5fd9cc905 100644
--- a/intern/cycles/kernel/osl/osl_closures.h
+++ b/intern/cycles/kernel/osl/osl_closures.h
@@ -33,8 +33,8 @@
 #ifndef __OSL_CLOSURES_H__
 #define __OSL_CLOSURES_H__
 
-#include "util_types.h"
-#include "kernel_types.h"
+#include "util/util_types.h"
+#include "kernel/kernel_types.h"
 
 #include <OSL/oslclosure.h>
 #include <OSL/oslexec.h>
@@ -51,10 +51,17 @@ OSL::ClosureParam *closure_bsdf_phong_ramp_params();
 OSL::ClosureParam *closure_bssrdf_cubic_params();
 OSL::ClosureParam *closure_bssrdf_gaussian_params();
 OSL::ClosureParam *closure_bssrdf_burley_params();
+OSL::ClosureParam *closure_bssrdf_principled_params();
 OSL::ClosureParam *closure_henyey_greenstein_volume_params();
 OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_params();
 OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params();
 OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_params();
+OSL::ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params();
+OSL::ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params();
+OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_fresnel_params();
+OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params();
+OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params();
+OSL::ClosureParam *closure_bsdf_principled_clearcoat_params();
 
 void closure_emission_prepare(OSL::RendererServices *, int id, void *data);
 void closure_background_prepare(OSL::RendererServices *, int id, void *data);
@@ -65,10 +72,17 @@ void closure_bsdf_phong_ramp_prepare(OSL::RendererServices *, int id, void *data
 void closure_bssrdf_cubic_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bssrdf_gaussian_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bssrdf_burley_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bssrdf_principled_prepare(OSL::RendererServices *, int id, void *data);
 void closure_henyey_greenstein_volume_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bsdf_microfacet_multi_ggx_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bsdf_microfacet_multi_ggx_glass_prepare(OSL::RendererServices *, int id, void *data);
 void closure_bsdf_microfacet_multi_ggx_aniso_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_ggx_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_ggx_aniso_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_multi_ggx_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare(OSL::RendererServices *, int id, void *data);
+void closure_bsdf_principled_clearcoat_prepare(OSL::RendererServices *, int id, void *data);
 
 #define CCLOSURE_PREPARE(name, classname)          \
 void name(RendererServices *, int id, void *data) \
diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h
index 65cb7ecc6b4..9585d9f4825 100644
--- a/intern/cycles/kernel/osl/osl_globals.h
+++ b/intern/cycles/kernel/osl/osl_globals.h
@@ -21,10 +21,10 @@
 
 #include <OSL/oslexec.h>
 
-#include "util_map.h"
-#include "util_param.h"
-#include "util_thread.h"
-#include "util_vector.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_thread.h"
+#include "util/util_vector.h"
 
 #ifndef WIN32
 using std::isfinite;
@@ -86,7 +86,7 @@ struct OSLThreadData {
 	OSL::ShaderGlobals globals;
 	OSL::PerThreadInfo *osl_thread_info;
 	OSLTraceData tracedata;
-	OSL::ShadingContext *context[SHADER_CONTEXT_NUM];
+	OSL::ShadingContext *context;
 	OIIO::TextureSystem::Perthread *oiio_thread_info;
 };
 
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index bc093272eca..c220a5ee3a1 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -25,38 +25,38 @@
 
 #include <string.h>
 
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-
-#include "osl_closures.h"
-#include "osl_globals.h"
-#include "osl_services.h"
-#include "osl_shader.h"
-
-#include "kernel_compat_cpu.h"
-#include "kernel_globals.h"
-#include "kernel_random.h"
-#include "kernel_projection.h"
-#include "kernel_differential.h"
-#include "kernel_montecarlo.h"
-#include "kernel_camera.h"
-
-#include "kernels/cpu/kernel_cpu_image.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+
+#include "kernel/osl/osl_closures.h"
+#include "kernel/osl/osl_globals.h"
+#include "kernel/osl/osl_services.h"
+#include "kernel/osl/osl_shader.h"
+
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernel_random.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_differential.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernels/cpu/kernel_cpu_image.h"
+#include "kernel/geom/geom.h"
+#include "kernel/bvh/bvh.h"
 
 /* Note: "util_foreach.h" needs to be included after "kernel_compat_cpu.h", as
  * for some reason ccl::foreach conflicts with openvdb::tools::foreach, which is
  * indirectly included through "kernel_compat_cpu.h".
  */
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_string.h"
-#include "geom/geom.h"
-#include "bvh/bvh.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_string.h"
 
-#include "kernel_projection.h"
-#include "kernel_accumulate.h"
-#include "kernel_shader.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_shader.h"
 
 #ifdef WITH_PTEX
 #  include <Ptexture.h>
@@ -107,6 +107,8 @@ ustring OSLRenderServices::u_curve_tangent_normal("geom:curve_tangent_normal");
 #endif
 ustring OSLRenderServices::u_path_ray_length("path:ray_length");
 ustring OSLRenderServices::u_path_ray_depth("path:ray_depth");
+ustring OSLRenderServices::u_path_diffuse_depth("path:diffuse_depth");
+ustring OSLRenderServices::u_path_glossy_depth("path:glossy_depth");
 ustring OSLRenderServices::u_path_transparent_depth("path:transparent_depth");
 ustring OSLRenderServices::u_path_transmission_depth("path:transmission_depth");
 ustring OSLRenderServices::u_trace("trace");
@@ -715,7 +717,7 @@ bool OSLRenderServices::get_object_standard_attribute(KernelGlobals *kg, ShaderD
 		else
 			motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, P);
 
-		if(!(sd->flag & SD_TRANSFORM_APPLIED)) {
+		if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 			object_position_transform(kg, sd, &P[0]);
 			object_position_transform(kg, sd, &P[1]);
 			object_position_transform(kg, sd, &P[2]);
@@ -764,6 +766,24 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData *
 		int f = state->bounce;
 		return set_attribute_int(f, type, derivatives, val);
 	}
+	else if(name == u_path_diffuse_depth) {
+		/* Diffuse Ray Depth */
+		PathState *state = sd->osl_path_state;
+		int f = state->diffuse_bounce;
+		return set_attribute_int(f, type, derivatives, val);
+	}
+	else if(name == u_path_glossy_depth) {
+		/* Glossy Ray Depth */
+		PathState *state = sd->osl_path_state;
+		int f = state->glossy_bounce;
+		return set_attribute_int(f, type, derivatives, val);
+	}
+	else if(name == u_path_transmission_depth) {
+		/* Transmission Ray Depth */
+		PathState *state = sd->osl_path_state;
+		int f = state->transmission_bounce;
+		return set_attribute_int(f, type, derivatives, val);
+	}
 	else if(name == u_path_transparent_depth) {
 		/* Transparent Ray Depth */
 		PathState *state = sd->osl_path_state;
@@ -808,7 +828,7 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData *
 bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg, bool derivatives, ustring object_name,
                                       TypeDesc type, ustring name, void *val)
 {
-	if(sg->renderstate == NULL)
+	if(sg == NULL || sg->renderstate == NULL)
 		return false;
 
 	ShaderData *sd = (ShaderData *)(sg->renderstate);
@@ -946,7 +966,7 @@ bool OSLRenderServices::texture(ustring filename,
 
 	if(filename.length() && filename[0] == '@') {
 		int slot = atoi(filename.c_str() + 1);
-		float4 rgba = kernel_tex_image_interp(slot, s, 1.0f - t);
+		float4 rgba = kernel_tex_image_interp(kg, slot, s, 1.0f - t);
 
 		result[0] = rgba[0];
 		if(nchannels > 1)
@@ -1027,7 +1047,7 @@ bool OSLRenderServices::texture3d(ustring filename,
 	bool status;
 	if(filename.length() && filename[0] == '@') {
 		int slot = atoi(filename.c_str() + 1);
-		float4 rgba = kernel_tex_image_interp_3d(slot, P.x, P.y, P.z);
+		float4 rgba = kernel_tex_image_interp_3d(kg, slot, P.x, P.y, P.z, INTERPOLATION_NONE);
 
 		result[0] = rgba[0];
 		if(nchannels > 1)
@@ -1181,8 +1201,9 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg,
 	tracedata->init = true;
 	tracedata->sd.osl_globals = sd->osl_globals;
 
-	/* raytrace */
-	return scene_intersect(sd->osl_globals, ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f);
+	/* Raytrace, leaving out shadow opaque to avoid early exit. */
+	uint visibility = PATH_RAY_ALL_VISIBILITY - PATH_RAY_SHADOW_OPAQUE;
+	return scene_intersect(sd->osl_globals, ray, visibility, &tracedata->isect, NULL, 0.0f, 0.0f);
 }
 
 
diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h
index 0f2e02c62b0..ec34ca77115 100644
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -165,6 +165,8 @@ public:
 	static ustring u_curve_tangent_normal;
 	static ustring u_path_ray_length;
 	static ustring u_path_ray_depth;
+	static ustring u_path_diffuse_depth;
+	static ustring u_path_glossy_depth;
 	static ustring u_path_transparent_depth;
 	static ustring u_path_transmission_depth;
 	static ustring u_trace;
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 0d762bbdb38..6b3a996ca12 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -16,21 +16,22 @@
 
 #include <OSL/oslexec.h>
 
-#include "kernel_compat_cpu.h"
-#include "kernel_montecarlo.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_types.h"
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
 
-#include "geom/geom_object.h"
+#include "kernel/geom/geom_object.h"
 
-#include "osl_closures.h"
-#include "osl_globals.h"
-#include "osl_services.h"
-#include "osl_shader.h"
+#include "kernel/osl/osl_closures.h"
+#include "kernel/osl/osl_globals.h"
+#include "kernel/osl/osl_services.h"
+#include "kernel/osl/osl_shader.h"
 
-#include "util_foreach.h"
+#include "util/util_foreach.h"
 
-#include "attribute.h"
+#include "render/attribute.h"
 
 
 CCL_NAMESPACE_BEGIN
@@ -56,9 +57,7 @@ void OSLShader::thread_init(KernelGlobals *kg, KernelGlobals *kernel_globals, OS
 	tdata->globals.tracedata = &tdata->tracedata;
 	tdata->globals.flipHandedness = false;
 	tdata->osl_thread_info = ss->create_thread_info();
-
-	for(int i = 0; i < SHADER_CONTEXT_NUM; i++)
-		tdata->context[i] = ss->get_context(tdata->osl_thread_info);
+	tdata->context = ss->get_context(tdata->osl_thread_info);
 
 	tdata->oiio_thread_info = osl_globals->ts->get_perthread_info();
 
@@ -73,9 +72,7 @@ void OSLShader::thread_free(KernelGlobals *kg)
 
 	OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
 	OSLThreadData *tdata = kg->osl_tdata;
-
-	for(int i = 0; i < SHADER_CONTEXT_NUM; i++)
-		ss->release_context(tdata->context[i]);
+	ss->release_context(tdata->context);
 
 	ss->destroy_thread_info(tdata->osl_thread_info);
 
@@ -172,7 +169,7 @@ static void flatten_surface_closure_tree(ShaderData *sd,
 	}
 }
 
-void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx)
+void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
 {
 	/* setup shader globals from shader data */
 	OSLThreadData *tdata = kg->osl_tdata;
@@ -181,7 +178,7 @@ void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state
 	/* execute shader for this point */
 	OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
 	OSL::ShaderGlobals *globals = &tdata->globals;
-	OSL::ShadingContext *octx = tdata->context[(int)ctx];
+	OSL::ShadingContext *octx = tdata->context;
 	int shader = sd->shader & SHADER_MASK;
 
 	/* automatic bump shader */
@@ -273,7 +270,7 @@ static void flatten_background_closure_tree(ShaderData *sd,
 	}
 }
 
-void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx)
+void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
 {
 	/* setup shader globals from shader data */
 	OSLThreadData *tdata = kg->osl_tdata;
@@ -282,7 +279,7 @@ void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *st
 	/* execute shader for this point */
 	OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
 	OSL::ShaderGlobals *globals = &tdata->globals;
-	OSL::ShadingContext *octx = tdata->context[(int)ctx];
+	OSL::ShadingContext *octx = tdata->context;
 
 	if(kg->osl->background_state) {
 		ss->execute(octx, *(kg->osl->background_state), *globals);
@@ -328,7 +325,7 @@ static void flatten_volume_closure_tree(ShaderData *sd,
 	}
 }
 
-void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx)
+void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
 {
 	/* setup shader globals from shader data */
 	OSLThreadData *tdata = kg->osl_tdata;
@@ -337,7 +334,7 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state,
 	/* execute shader */
 	OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
 	OSL::ShaderGlobals *globals = &tdata->globals;
-	OSL::ShadingContext *octx = tdata->context[(int)ctx];
+	OSL::ShadingContext *octx = tdata->context;
 	int shader = sd->shader & SHADER_MASK;
 
 	if(kg->osl->volume_state[shader]) {
@@ -351,19 +348,17 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state,
 
 /* Displacement */
 
-void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx)
+void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state)
 {
 	/* setup shader globals from shader data */
 	OSLThreadData *tdata = kg->osl_tdata;
 
-	PathState state = {0};
-
-	shaderdata_to_shaderglobals(kg, sd, &state, 0, tdata);
+	shaderdata_to_shaderglobals(kg, sd, state, 0, tdata);
 
 	/* execute shader */
 	OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
 	OSL::ShaderGlobals *globals = &tdata->globals;
-	OSL::ShadingContext *octx = tdata->context[(int)ctx];
+	OSL::ShadingContext *octx = tdata->context;
 	int shader = sd->shader & SHADER_MASK;
 
 	if(kg->osl->displacement_state[shader]) {
diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h
index ad06dd6929d..6b392b25cf7 100644
--- a/intern/cycles/kernel/osl/osl_shader.h
+++ b/intern/cycles/kernel/osl/osl_shader.h
@@ -29,7 +29,7 @@
  * This means no thread state must be passed along in the kernel itself.
  */
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -53,10 +53,10 @@ public:
 	static void thread_free(KernelGlobals *kg);
 
 	/* eval */
-	static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx);
-	static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx);
-	static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx);
-	static void eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx);
+	static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
+	static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
+	static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
+	static void eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state);
 
 	/* attributes */
 	static int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeDescriptor *desc);
diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt
index b43f8402d42..1a8ed4c884a 100644
--- a/intern/cycles/kernel/shaders/CMakeLists.txt
+++ b/intern/cycles/kernel/shaders/CMakeLists.txt
@@ -81,13 +81,15 @@ set(SRC_OSL
 	node_wireframe.osl
 	node_hair_bsdf.osl
 	node_uv_map.osl
+	node_principled_bsdf.osl
 	node_rgb_to_bw.osl
 )
 
 set(SRC_OSL_HEADERS
-	node_texture.h
 	node_color.h
 	node_fresnel.h
+	node_ramp_util.h
+	node_texture.h
 	stdosl.h
 	oslutil.h
 )
diff --git a/intern/cycles/kernel/shaders/node_light_path.osl b/intern/cycles/kernel/shaders/node_light_path.osl
index a021a40467d..64fe4c20132 100644
--- a/intern/cycles/kernel/shaders/node_light_path.osl
+++ b/intern/cycles/kernel/shaders/node_light_path.osl
@@ -27,6 +27,8 @@ shader node_light_path(
 	output float IsVolumeScatterRay = 0.0,
 	output float RayLength = 0.0,
 	output float RayDepth = 0.0,
+	output float DiffuseDepth = 0.0,
+	output float GlossyDepth = 0.0,
 	output float TransparentDepth = 0.0,
 	output float TransmissionDepth = 0.0)
 {
@@ -45,6 +47,14 @@ shader node_light_path(
 	getattribute("path:ray_depth", ray_depth);
 	RayDepth = (float)ray_depth;
 
+	int diffuse_depth;
+	getattribute("path:diffuse_depth", diffuse_depth);
+	DiffuseDepth = (float)diffuse_depth;
+
+	int glossy_depth;
+	getattribute("path:glossy_depth", glossy_depth);
+	GlossyDepth = (float)glossy_depth;
+
 	int transparent_depth;
 	getattribute("path:transparent_depth", transparent_depth);
 	TransparentDepth = (float)transparent_depth;
diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
new file mode 100644
index 00000000000..6870d479af3
--- /dev/null
+++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stdosl.h"
+#include "node_fresnel.h"
+
+shader node_principled_bsdf( /* sums diffuse/subsurface, sheen, specular, transmission and clearcoat closures into BSDF */
+	string distribution = "Multiscatter GGX",
+	color BaseColor = color(0.8, 0.8, 0.8),
+	float Subsurface = 0.0,
+	vector SubsurfaceRadius = vector(1.0, 1.0, 1.0),
+	color SubsurfaceColor = color(0.7, 0.1, 0.1),
+	float Metallic = 0.0,
+	float Specular = 0.5,
+	float SpecularTint = 0.0,
+	float Roughness = 0.5,
+	float Anisotropic = 0.0,
+	float AnisotropicRotation = 0.0,
+	float Sheen = 0.0,
+	float SheenTint = 0.5,
+	float Clearcoat = 0.0,
+	float ClearcoatRoughness = 0.03,
+	float IOR = 1.45,
+	float Transmission = 0.0,
+	float TransmissionRoughness = 0.0,
+	normal Normal = N,
+	normal ClearcoatNormal = N,
+	normal Tangent = normalize(dPdu),
+	output closure color BSDF = 0)
+{
+	float f = max(IOR, 1e-5); /* keep the IOR strictly positive; it is inverted for back faces below */
+	float diffuse_weight = (1.0 - clamp(Metallic, 0.0, 1.0)) * (1.0 - clamp(Transmission, 0.0, 1.0)); /* shrinks with both Metallic and Transmission */
+	float final_transmission = clamp(Transmission, 0.0, 1.0) * (1.0 - clamp(Metallic, 0.0, 1.0)); /* Metallic = 1 removes transmission */
+	float specular_weight = (1.0 - final_transmission); /* whatever is not transmitted */
+
+	vector T = Tangent;
+
+	float m_cdlum = luminance(BaseColor);
+	color m_ctint = m_cdlum > 0.0 ? BaseColor / m_cdlum : color(0.0, 0.0, 0.0); // divide by luminance to isolate hue+sat; black if BaseColor has no luminance
+
+	/* rotate the tangent around Normal; AnisotropicRotation is scaled by M_2PI to get the angle */
+	if (AnisotropicRotation != 0.0)
+		T = rotate(T, AnisotropicRotation * M_2PI, point(0.0, 0.0, 0.0), Normal);
+
+	if (diffuse_weight > 1e-5) { /* base layer: subsurface or diffuse, plus optional sheen */
+		if (Subsurface > 1e-5) {
+			color mixed_ss_base_color = SubsurfaceColor * Subsurface + BaseColor * (1.0 - Subsurface); /* linear blend by Subsurface */
+			BSDF = mixed_ss_base_color * bssrdf_principled(Normal, Subsurface * SubsurfaceRadius, 0.0, SubsurfaceColor, Roughness);
+		} else {
+			BSDF = BaseColor * principled_diffuse(Normal, Roughness);
+		}
+
+		if (Sheen > 1e-5) {
+			color sheen_color = color(1.0, 1.0, 1.0) * (1.0 - SheenTint) + m_ctint * SheenTint; /* white blended towards the base tint */
+
+			BSDF = BSDF + sheen_color * Sheen * principled_sheen(Normal);
+		}
+
+		BSDF = BSDF * diffuse_weight; /* scales everything accumulated so far, sheen included */
+	}
+
+	if (specular_weight > 1e-5) { /* specular reflection lobe */
+		float aspect = sqrt(1.0 - Anisotropic * 0.9); /* Anisotropic = 0 gives aspect 1, i.e. alpha_x == alpha_y */
+		float r2 = Roughness * Roughness;
+
+		float alpha_x = r2 / aspect;
+		float alpha_y = r2 * aspect;
+
+		color tmp_col = color(1.0, 1.0, 1.0) * (1.0 - SpecularTint) + m_ctint * SpecularTint; /* white blended towards the base tint */
+
+		color Cspec0 = (Specular * 0.08 * tmp_col) * (1.0 - Metallic) + BaseColor * Metallic; /* Metallic = 1 uses BaseColor directly */
+
+		if (distribution == "GGX" || Roughness <= 0.075) { /* single-scatter GGX when requested or at low roughness */
+			BSDF = BSDF  + specular_weight * microfacet_ggx_aniso_fresnel(Normal, T, alpha_x, alpha_y, (2.0 / (1.0 - sqrt(0.08 * Specular))) - 1.0, BaseColor, Cspec0); /* eta argument equals (1 + s) / (1 - s), s = sqrt(0.08 * Specular) */
+		} else { /* multiscatter GGX otherwise, same arguments */
+			BSDF = BSDF + specular_weight * microfacet_multi_ggx_aniso_fresnel(Normal, T, alpha_x, alpha_y, (2.0 / (1.0 - sqrt(0.08 * Specular))) - 1.0, BaseColor, Cspec0);
+		}
+	}
+
+	if (final_transmission > 1e-5) { /* transmission lobe */
+		color Cspec0 = BaseColor * SpecularTint + color(1.0, 1.0, 1.0) * (1.0 - SpecularTint); /* separate from the reflection-lobe Cspec0 above */
+		float eta = backfacing() ? 1.0 / f : f; /* inverted IOR on back faces */
+
+		if (distribution == "GGX" || Roughness <= 5e-2) { /* explicit reflection + refraction pair weighted by Fr */
+			float cosNO = dot(Normal, I);
+			float Fr = fresnel_dielectric_cos(cosNO, eta);
+
+			float refl_roughness = Roughness;
+			if (Roughness <= 1e-2) /* snap very low roughness to zero */
+				refl_roughness = 0.0;
+
+			float transmission_roughness = refl_roughness;
+			if (distribution == "GGX") /* TransmissionRoughness is only applied for the GGX distribution */
+				transmission_roughness = 1.0 - (1.0 - refl_roughness) * (1.0 - TransmissionRoughness);
+
+			BSDF = BSDF + final_transmission * (Fr * microfacet_ggx_fresnel(Normal, refl_roughness * refl_roughness, eta, BaseColor, Cspec0) +
+			       (1.0 - Fr) * BaseColor * microfacet_ggx_refraction(Normal, transmission_roughness * transmission_roughness, eta));
+		} else { /* multiscatter glass closure covers the remaining case */
+			BSDF = BSDF + final_transmission * microfacet_multi_ggx_glass_fresnel(Normal, Roughness * Roughness, eta, BaseColor, Cspec0);
+		}
+	}
+
+	if (Clearcoat > 1e-5) { /* added on top, not scaled by any of the weights above */
+		BSDF = BSDF + principled_clearcoat(ClearcoatNormal, Clearcoat, ClearcoatRoughness * ClearcoatRoughness);
+	}
+}
+
diff --git a/intern/cycles/kernel/shaders/stdosl.h b/intern/cycles/kernel/shaders/stdosl.h
index a8dda8a12c9..c91d2918687 100644
--- a/intern/cycles/kernel/shaders/stdosl.h
+++ b/intern/cycles/kernel/shaders/stdosl.h
@@ -530,6 +530,11 @@ closure color microfacet_ggx_refraction(normal N, float ag, float eta) BUILTIN;
 closure color microfacet_multi_ggx(normal N, float ag, color C) BUILTIN;
 closure color microfacet_multi_ggx_aniso(normal N, vector T, float ax, float ay, color C) BUILTIN;
 closure color microfacet_multi_ggx_glass(normal N, float ag, float eta, color C) BUILTIN;
+closure color microfacet_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_ggx_aniso_fresnel(normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_multi_ggx_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_multi_ggx_aniso_fresnel(normal N, vector T, float ax, float ay, float eta, color C, color Cspec0) BUILTIN;
+closure color microfacet_multi_ggx_glass_fresnel(normal N, float ag, float eta, color C, color Cspec0) BUILTIN;
 closure color microfacet_beckmann(normal N, float ab) BUILTIN;
 closure color microfacet_beckmann_aniso(normal N, vector T, float ax, float ay) BUILTIN;
 closure color microfacet_beckmann_refraction(normal N, float ab, float eta) BUILTIN;
@@ -539,11 +544,15 @@ closure color emission() BUILTIN;
 closure color background() BUILTIN;
 closure color holdout() BUILTIN;
 closure color ambient_occlusion() BUILTIN;
+closure color principled_diffuse(normal N, float roughness) BUILTIN;
+closure color principled_sheen(normal N) BUILTIN;
+closure color principled_clearcoat(normal N, float clearcoat, float clearcoat_roughness) BUILTIN;
 
 // BSSRDF
 closure color bssrdf_cubic(normal N, vector radius, float texture_blur, float sharpness) BUILTIN;
 closure color bssrdf_gaussian(normal N, vector radius, float texture_blur) BUILTIN;
 closure color bssrdf_burley(normal N, vector radius, float texture_blur, color albedo) BUILTIN;
+closure color bssrdf_principled(normal N, vector radius, float texture_blur, color subsurface_color, float roughness) BUILTIN;
 
 // Hair
 closure color hair_reflection(normal N, float roughnessu, float roughnessv, vector T, float offset) BUILTIN;
diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h
deleted file mode 100644
index 9bfa71c75ef..00000000000
--- a/intern/cycles/kernel/split/kernel_background_buffer_update.h
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_split_common.h"
-
-/* Note on kernel_background_buffer_update kernel.
- * This is the fourth kernel in the ray tracing logic, and the third
- * of the path iteration kernels. This kernel takes care of rays that hit
- * the background (sceneintersect kernel), and for the rays of
- * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in
- * the output buffer. This kernel also takes care of rays that have been determined
- * to-be-regenerated.
- *
- * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel
- *
- * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
- * will be eventually set to RAY_TO_REGENERATE state in this kernel. Finally all rays of ray_state
- * RAY_TO_REGENERATE will be regenerated and put in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * The input and output are as follows,
- *
- * rng_coop ---------------------------------------------|--- kernel_background_buffer_update --|--- PathRadiance_coop
- * throughput_coop --------------------------------------|                                      |--- L_transparent_coop
- * per_sample_output_buffers ----------------------------|                                      |--- per_sample_output_buffers
- * Ray_coop ---------------------------------------------|                                      |--- ray_state
- * PathState_coop ---------------------------------------|                                      |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * L_transparent_coop -----------------------------------|                                      |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * ray_state --------------------------------------------|                                      |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----|                                      |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                                      |--- work_array
- * parallel_samples -------------------------------------|                                      |--- PathState_coop
- * end_sample -------------------------------------------|                                      |--- throughput_coop
- * kg (globals) -----------------------------------------|                                      |--- rng_coop
- * rng_state --------------------------------------------|                                      |--- Ray
- * PathRadiance_coop ------------------------------------|                                      |
- * sw ---------------------------------------------------|                                      |
- * sh ---------------------------------------------------|                                      |
- * sx ---------------------------------------------------|                                      |
- * sy ---------------------------------------------------|                                      |
- * stride -----------------------------------------------|                                      |
- * work_array -------------------------------------------|                                      |--- work_array
- * queuesize --------------------------------------------|                                      |
- * start_sample -----------------------------------------|                                      |--- work_pool_wgs
- * work_pool_wgs ----------------------------------------|                                      |
- * num_samples ------------------------------------------|                                      |
- *
- * note on sd : sd argument is neither an input nor an output for this kernel. It is just filled and consumed here itself.
- * Note on Queues :
- * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
- *
- * State of queues when this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty
- */
-ccl_device char kernel_background_buffer_update(
-        KernelGlobals *kg,
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_state,
-        ccl_global uint *rng_coop,             /* Required for buffer Update */
-        ccl_global float3 *throughput_coop,    /* Required for background hit processing */
-        PathRadiance *PathRadiance_coop,       /* Required for background hit processing and buffer Update */
-        ccl_global Ray *Ray_coop,              /* Required for background hit processing */
-        ccl_global PathState *PathState_coop,  /* Required for background hit processing */
-        ccl_global float *L_transparent_coop,  /* Required for background hit processing and buffer Update */
-        ccl_global char *ray_state,            /* Stores information on the current state of a ray */
-        int sw, int sh, int sx, int sy, int stride,
-        int rng_state_offset_x,
-        int rng_state_offset_y,
-        int rng_state_stride,
-        ccl_global unsigned int *work_array,   /* Denotes work of each ray */
-        int end_sample,
-        int start_sample,
-#ifdef __WORK_STEALING__
-        ccl_global unsigned int *work_pool_wgs,
-        unsigned int num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples,                  /* Number of samples to be processed in parallel */
-        int ray_index)
-{
-	char enqueue_flag = 0;
-#ifdef __KERNEL_DEBUG__
-	DebugData *debug_data = &debugdata_coop[ray_index];
-#endif
-	ccl_global PathState *state = &PathState_coop[ray_index];
-	PathRadiance *L = L = &PathRadiance_coop[ray_index];
-	ccl_global Ray *ray = &Ray_coop[ray_index];
-	ccl_global float3 *throughput = &throughput_coop[ray_index];
-	ccl_global float *L_transparent = &L_transparent_coop[ray_index];
-	ccl_global uint *rng = &rng_coop[ray_index];
-
-#ifdef __WORK_STEALING__
-	unsigned int my_work;
-	ccl_global float *initial_per_sample_output_buffers;
-	ccl_global uint *initial_rng;
-#endif
-	unsigned int sample;
-	unsigned int tile_x;
-	unsigned int tile_y;
-	unsigned int pixel_x;
-	unsigned int pixel_y;
-	unsigned int my_sample_tile;
-
-#ifdef __WORK_STEALING__
-	my_work = work_array[ray_index];
-	sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-	get_pixel_tile_position(&pixel_x, &pixel_y,
-	                        &tile_x, &tile_y,
-	                        my_work,
-	                        sw, sh, sx, sy,
-	                        parallel_samples,
-	                        ray_index);
-	my_sample_tile = 0;
-	initial_per_sample_output_buffers = per_sample_output_buffers;
-	initial_rng = rng_state;
-#else  /* __WORK_STEALING__ */
-	sample = work_array[ray_index];
-	int tile_index = ray_index / parallel_samples;
-	/* buffer and rng_state's stride is "stride". Find x and y using ray_index */
-	tile_x = tile_index % sw;
-	tile_y = tile_index / sw;
-	my_sample_tile = ray_index - (tile_index * parallel_samples);
-#endif  /* __WORK_STEALING__ */
-
-	rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
-	per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
-
-	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-		/* eval background shader if nothing hit */
-		if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
-			*L_transparent = (*L_transparent) + average((*throughput));
-#ifdef __PASSES__
-			if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-		}
-
-		if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-#ifdef __BACKGROUND__
-			/* sample background shader */
-			float3 L_background = indirect_background(kg, kg->sd_input, state, ray);
-			path_radiance_accum_background(L, (*throughput), L_background, state->bounce);
-#endif
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-		}
-	}
-
-	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
-		float3 L_sum = path_radiance_clamp_and_sum(kg, L);
-		kernel_write_light_passes(kg, per_sample_output_buffers, L, sample);
-#ifdef __KERNEL_DEBUG__
-		kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample);
-#endif
-		float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
-
-		/* accumulate result in output buffer */
-		kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
-		path_rng_end(kg, rng_state, *rng);
-
-		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
-	}
-
-	if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
-#ifdef __WORK_STEALING__
-		/* We have completed current work; So get next work */
-		int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
-		if(!valid_work) {
-			/* If work is invalid, this means no more work is available and the thread may exit */
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
-		}
-#else  /* __WORK_STEALING__ */
-		if((sample + parallel_samples) >= end_sample) {
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
-		}
-#endif  /* __WORK_STEALING__ */
-
-		if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
-#ifdef __WORK_STEALING__
-			work_array[ray_index] = my_work;
-			/* Get the sample associated with the current work */
-			sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-			/* Get pixel and tile position associated with current work */
-			get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
-			my_sample_tile = 0;
-
-			/* Remap rng_state according to the current work */
-			rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride);
-			/* Remap per_sample_output_buffers according to the current work */
-			per_sample_output_buffers = initial_per_sample_output_buffers
-				+ (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
-#else  /* __WORK_STEALING__ */
-			work_array[ray_index] = sample + parallel_samples;
-			sample = work_array[ray_index];
-
-			/* Get ray position from ray index */
-			pixel_x = sx + ((ray_index / parallel_samples) % sw);
-			pixel_y = sy + ((ray_index / parallel_samples) / sw);
-#endif  /* __WORK_STEALING__ */
-
-			/* Initialize random numbers and ray. */
-			kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray);
-
-			if(ray->t != 0.0f) {
-				/* Initialize throughput, L_transparent, Ray, PathState;
-				 * These rays proceed with path-iteration.
-				 */
-				*throughput = make_float3(1.0f, 1.0f, 1.0f);
-				*L_transparent = 0.0f;
-				path_radiance_init(L, kernel_data.film.use_light_pass);
-				path_state_init(kg, kg->sd_input, state, rng, sample, ray);
-#ifdef __KERNEL_DEBUG__
-				debug_data_init(debug_data);
-#endif
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
-				enqueue_flag = 1;
-			}
-			else {
-				/* These rays do not participate in path-iteration. */
-				float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-				/* Accumulate result in output buffer. */
-				kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
-				path_rng_end(kg, rng_state, *rng);
-
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
-			}
-		}
-	}
-	return enqueue_flag;
-}
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
new file mode 100644
index 00000000000..2313feac089
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_branched.h
@@ -0,0 +1,219 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __BRANCHED_PATH__
+
+/* Sets up the state needed to run an indirect loop for ray_index: snapshots the ray's split state and resets the loop counters. */
+ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg, int ray_index)
+{
+	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+	/* Save a copy of each per-ray state field in branched_state to restore later. */
+#define BRANCHED_STORE(name) \
+		branched_state->name = kernel_split_state.name[ray_index];
+
+	BRANCHED_STORE(path_state);
+	BRANCHED_STORE(throughput);
+	BRANCHED_STORE(ray);
+	BRANCHED_STORE(sd);
+	BRANCHED_STORE(isect);
+	BRANCHED_STORE(ray_state);
+
+#undef BRANCHED_STORE
+
+	/* Set loop counters to their initial position. */
+	branched_state->next_closure = 0;
+	branched_state->next_sample = 0;
+}
+
+/* Ends an indirect loop for ray_index: copies the saved fields back into the split state and clears RAY_BRANCHED_INDIRECT. */
+ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg, int ray_index)
+{
+	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+	/* Restore the per-ray state fields from the copies held in branched_state. */
+#define BRANCHED_RESTORE(name) \
+		kernel_split_state.name[ray_index] = branched_state->name;
+
+	BRANCHED_RESTORE(path_state);
+	BRANCHED_RESTORE(throughput);
+	BRANCHED_RESTORE(ray);
+	BRANCHED_RESTORE(sd);
+	BRANCHED_RESTORE(isect);
+	BRANCHED_RESTORE(ray_state);
+
+#undef BRANCHED_RESTORE
+
+	/* Leave the indirect loop: clear RAY_BRANCHED_INDIRECT on the just-restored ray state. */
+	REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT);
+}
+
+ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals *kg, int ray_index)
+{
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+	/* Try to move the work currently set up on ray_index onto a ray taken from QUEUE_INACTIVE_RAYS; returns true on success. */
+	int inactive_ray = dequeue_ray_index(QUEUE_INACTIVE_RAYS,
+		kernel_split_state.queue_data, kernel_split_params.queue_size, kernel_split_params.queue_index);
+	/* Bail out unless the dequeued ray is in state RAY_INACTIVE. NOTE(review): an empty-queue result is presumably rejected by IS_STATE before indexing; confirm. */
+	if(!IS_STATE(ray_state, inactive_ray, RAY_INACTIVE)) {
+		return false;
+	}
+	/* Copy every entry listed in SPLIT_DATA_ENTRIES_BRANCHED_SHARED from ray_index to the dequeued ray. */
+#define SPLIT_DATA_ENTRY(type, name, num) \
+		kernel_split_state.name[inactive_ray] = kernel_split_state.name[ray_index];
+	SPLIT_DATA_ENTRIES_BRANCHED_SHARED
+#undef SPLIT_DATA_ENTRY
+	/* Reset the shared-sample bookkeeping of the dequeued ray and record which ray it was started from. */
+	kernel_split_state.branched_state[inactive_ray].shared_sample_count = 0;
+	kernel_split_state.branched_state[inactive_ray].original_ray = ray_index;
+	kernel_split_state.branched_state[inactive_ray].waiting_on_shared_samples = false;
+
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray];
+	/* Start from a freshly initialized radiance, then let path_radiance_copy_indirect carry data over from the original ray's radiance. */
+	path_radiance_init(inactive_L, kernel_data.film.use_light_pass);
+	path_radiance_copy_indirect(inactive_L, L);
+	/* Mark the dequeued ray as regenerated and shared, and add whatever IS_FLAG reports for RAY_BRANCHED_INDIRECT on the original ray. */
+	ray_state[inactive_ray] = RAY_REGENERATED;
+	ADD_RAY_FLAG(ray_state, inactive_ray, RAY_BRANCHED_INDIRECT_SHARED);
+	ADD_RAY_FLAG(ray_state, inactive_ray, IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT));
+	/* Atomically bump the original ray's count of samples that were handed to other rays. */
+	atomic_fetch_and_inc_uint32((ccl_global uint*)&kernel_split_state.branched_state[ray_index].shared_sample_count);
+
+	return true;
+}
+
+/* Bounce off the surface and integrate indirect light. Resumes the closure/sample loop at branched_state->next_closure/next_sample; returns true when a bounce was set up on this ray (or, with wait_for_shared, shared samples are still outstanding) and false when the loop is finished. */
+ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(KernelGlobals *kg,
+                                                                                int ray_index,
+                                                                                float num_samples_adjust,
+                                                                                ShaderData *saved_sd,
+                                                                                bool reset_path_state,
+                                                                                bool wait_for_shared)
+{
+	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+	ShaderData *sd = saved_sd;
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	float3 throughput = branched_state->throughput;
+	ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
+	/* Sum of sample weights over the non-transparent BSDF closures (1.0 when denoising_feature_weight is not positive, left at 0.0 without __DENOISING_FEATURES__); forwarded to the bounce below. */
+	float sum_sample_weight = 0.0f;
+#ifdef __DENOISING_FEATURES__
+	if(ps->denoising_feature_weight > 0.0f) {
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
+
+			/* transparency is not handled here, but in outer loop */
+			if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+				continue;
+			}
+
+			sum_sample_weight += sc->sample_weight;
+		}
+	}
+	else {
+		sum_sample_weight = 1.0f;
+	}
+#endif  /* __DENOISING_FEATURES__ */
+
+	for(int i = branched_state->next_closure; i < sd->num_closure; i++) {
+		const ShaderClosure *sc = &sd->closure[i];
+
+		if(!CLOSURE_IS_BSDF(sc->type))
+			continue;
+		/* transparency is not handled here, but in outer loop */
+		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
+			continue;
+
+		int num_samples;
+
+		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
+			num_samples = kernel_data.integrator.diffuse_samples;
+		else if(CLOSURE_IS_BSDF_BSSRDF(sc->type))
+			num_samples = 1;
+		else if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
+			num_samples = kernel_data.integrator.glossy_samples;
+		else
+			num_samples = kernel_data.integrator.transmission_samples;
+
+		num_samples = ceil_to_int(num_samples_adjust*num_samples);
+		/* Scale applied to the throughput of every sample of this closure. */
+		float num_samples_inv = num_samples_adjust/num_samples;
+
+		for(int j = branched_state->next_sample; j < num_samples; j++) {
+			if(reset_path_state) {
+				*ps = branched_state->path_state;
+			}
+			/* Bounce with a hash derived from the saved rng_hash and the closure index; the saved hash is put back after a successful bounce. */
+			ps->rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
+
+			ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
+			*tp = throughput;
+
+			ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index];
+
+			if(!kernel_branched_path_surface_bounce(kg,
+			                                        sd,
+			                                        sc,
+			                                        j,
+			                                        num_samples,
+			                                        tp,
+			                                        ps,
+			                                        &L->state,
+			                                        bsdf_ray,
+			                                        sum_sample_weight))
+			{
+				continue;
+			}
+
+			ps->rng_hash = branched_state->path_state.rng_hash;
+
+			/* update state for next iteration */
+			branched_state->next_closure = i;
+			branched_state->next_sample = j+1;
+
+			/* start the indirect path */
+			*tp *= num_samples_inv;
+			/* If the sample could be handed to another ray keep looping; otherwise return with the bounce set up on this ray. */
+			if(kernel_split_branched_indirect_start_shared(kg, ray_index)) {
+				continue;
+			}
+
+			return true;
+		}
+
+		branched_state->next_sample = 0;
+	}
+	/* All closures were processed; a later call starts past the last closure. */
+	branched_state->next_closure = sd->num_closure;
+	/* Optionally report that samples handed to other rays (shared_sample_count) are still outstanding. */
+	if(wait_for_shared) {
+		branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+		if(branched_state->waiting_on_shared_samples) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+#endif  /* __BRANCHED_PATH__ */
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
new file mode 100644
index 00000000000..511334e0550
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel writes the accumulated radiance of rays in state
+ * RAY_UPDATE_BUFFER to the output buffer, and then takes care of rays that
+ * have been determined to-be-regenerated: each of them fetches its next work
+ * item, or becomes RAY_INACTIVE when no work is left.
+ *
+ * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel.
+ *
+ * All rays that are in state RAY_UPDATE_BUFFER are set to RAY_TO_REGENERATE
+ * state in this kernel. Rays of ray_state RAY_TO_REGENERATE that are
+ * successfully regenerated become RAY_REGENERATED and are put in queue
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * State of queues when this kernel is called:
+ * At entry,
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays.
+ * At exit,
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
+ *     RAY_REGENERATED rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ */
+ccl_device void kernel_buffer_update(KernelGlobals *kg,
+                                     ccl_local_param unsigned int *local_queue_atomics)
+{
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+	/* Linear work-item index computed from the 2D global id. */
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(ray_index == 0) {
+		/* We will empty this queue in this kernel. */
+		kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+	}
+	char enqueue_flag = 0;
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          1);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+
+	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+		uint sample = state->sample;
+		uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+		ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+
+		/* accumulate result in output buffer */
+		kernel_write_result(kg, buffer, sample, L);
+
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+	}
+
+	if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+		/* We have completed current work; So get next work */
+		ccl_global uint *work_pools = kernel_split_params.work_pools;
+		uint total_work_size = kernel_split_params.total_work_size;
+		uint work_index;
+
+		if(!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
+			/* If work is invalid, this means no more work is available and the thread may exit */
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+		}
+
+		if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+			ccl_global WorkTile *tile = &kernel_split_params.tile;
+			uint x, y, sample;
+			get_work_pixel(tile, work_index, &x, &y, &sample);
+
+			/* Store buffer offset for writing to passes. */
+			uint buffer_offset = (tile->offset + x + y*tile->stride) * kernel_data.film.pass_stride;
+			kernel_split_state.buffer_offset[ray_index] = buffer_offset;
+
+			/* Initialize random numbers and ray. */
+			uint rng_hash;
+			kernel_path_trace_setup(kg, sample, x, y, &rng_hash, ray);
+
+			if(ray->t != 0.0f) {
+				/* Initialize throughput, path radiance, Ray, PathState;
+				 * These rays proceed with path-iteration.
+				 */
+				*throughput = make_float3(1.0f, 1.0f, 1.0f);
+				path_radiance_init(L, kernel_data.film.use_light_pass);
+				path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, rng_hash, sample, ray);
+#ifdef __SUBSURFACE__
+				kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
+#endif
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+				enqueue_flag = 1;
+			}
+			else {
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);  /* ray->t == 0: the ray stays in RAY_TO_REGENERATE and enqueue_flag stays 0 */
+			}
+		}
+	}
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	 * These rays will be made active during the next scene intersect kernel.
+	 */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index 6e158d53d23..77fb61b80a8 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -14,221 +14,96 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_data_initialization kernel
- * This kernel Initializes structures needed in path-iteration kernels.
- * This is the first kernel in ray-tracing logic.
+/* This kernel initializes the structures needed by the path-iteration kernels.
  *
- * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
- *
- * Its input and output are as follows,
- *
- * Un-initialized rng---------------|--- kernel_data_initialization ---|--- Initialized rng
- * Un-initialized throughput -------|                                  |--- Initialized throughput
- * Un-initialized L_transparent ----|                                  |--- Initialized L_transparent
- * Un-initialized PathRadiance -----|                                  |--- Initialized PathRadiance
- * Un-initialized Ray --------------|                                  |--- Initialized Ray
- * Un-initialized PathState --------|                                  |--- Initialized PathState
- * Un-initialized QueueData --------|                                  |--- Initialized QueueData (to QUEUE_EMPTY_SLOT)
- * Un-initialized QueueIndex -------|                                  |--- Initialized QueueIndex (to 0)
- * Un-initialized use_queues_flag---|                                  |--- Initialized use_queues_flag (to false)
- * Un-initialized ray_state --------|                                  |--- Initialized ray_state
- * parallel_samples --------------- |                                  |--- Initialized per_sample_output_buffers
- * rng_state -----------------------|                                  |--- Initialized work_array
- * data ----------------------------|                                  |--- Initialized work_pool_wgs
- * start_sample --------------------|                                  |
- * sx ------------------------------|                                  |
- * sy ------------------------------|                                  |
- * sw ------------------------------|                                  |
- * sh ------------------------------|                                  |
- * stride --------------------------|                                  |
- * queuesize -----------------------|                                  |
- * num_samples ---------------------|                                  |
- *
- * Note on Queues :
+ * Note on Queues:
  * All slots in queues are initialized to queue empty slot;
  * The number of elements in the queues is initialized to 0;
  */
+
+#ifndef __KERNEL_CPU__
 ccl_device void kernel_data_init(
+#else
+void KERNEL_FUNCTION_FULL_NAME(data_init)(
+#endif
         KernelGlobals *kg,
-        ShaderData *sd_DL_shadow,
         ccl_constant KernelData *data,
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_state,
-        ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
-        ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
-        ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
-        PathRadiance *PathRadiance_coop,             /* PathRadiance array to store PathRadiance values for all rays */
-        ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
-        ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
-        Intersection *Intersection_coop_shadow,
-        ccl_global char *ray_state,                  /* Stores information on current state of a ray */
-
-#define KERNEL_TEX(type, ttype, name)                                   \
-        ccl_global type *name,
-#include "../kernel_textures.h"
-
-        int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
-        int rng_state_offset_x,
-        int rng_state_offset_y,
-        int rng_state_stride,
-        ccl_global int *Queue_data,                  /* Memory for queues */
+        ccl_global void *split_data_buffer,
+        int num_elements,
+        ccl_global char *ray_state,
+
+#ifdef __KERNEL_OPENCL__
+		KERNEL_BUFFER_PARAMS,
+#endif
+
+        int start_sample,
+        int end_sample,
+        int sx, int sy, int sw, int sh, int offset, int stride,
         ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
         int queuesize,                               /* size (capacity) of the queue */
         ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
-        ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
-#ifdef __WORK_STEALING__
-        ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
-        unsigned int num_samples,                    /* Total number of samples per pixel */
-#endif
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples)                        /* Number of samples to be processed in parallel */
+        ccl_global unsigned int *work_pools,      /* Work pool for each work group */
+        unsigned int num_samples,
+        ccl_global float *buffer)
 {
+#ifdef KERNEL_STUB
+	STUB_ASSERT(KERNEL_ARCH, data_init);
+#else
+
+#ifdef __KERNEL_OPENCL__
 	kg->data = data;
-	kg->sd_input = sd_DL_shadow;
-	kg->isect_shadow = Intersection_coop_shadow;
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "../kernel_textures.h"
-
-	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-
-#ifdef __WORK_STEALING__
-	int lid = get_local_id(1) * get_local_size(0) + get_local_id(0);
-	/* Initialize work_pool_wgs */
-	if(lid == 0) {
-		int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0);
-		work_pool_wgs[group_index] = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-#endif  /* __WORK_STEALING__ */
+#endif
 
-	/* Initialize queue data and queue index. */
-	if(thread_index < queuesize) {
-		/* Initialize active ray queue. */
-		Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
-		/* Initialize background and buffer update queue. */
-		Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
-		/* Initialize shadow ray cast of AO queue. */
-		Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
-		/* Initialize shadow ray cast of direct lighting queue. */
-		Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
-	}
+	kernel_split_params.tile.x = sx;
+	kernel_split_params.tile.y = sy;
+	kernel_split_params.tile.w = sw;
+	kernel_split_params.tile.h = sh;
 
-	if(thread_index == 0) {
-		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
-		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
-		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
-		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
-		/* The scene-intersect kernel should not use the queues very first time.
-		 * since the queue would be empty.
-		 */
-		use_queues_flag[0] = 0;
-	}
+	kernel_split_params.tile.start_sample = start_sample;
+	kernel_split_params.tile.num_samples = num_samples;
 
-	int x = get_global_id(0);
-	int y = get_global_id(1);
+	kernel_split_params.tile.offset = offset;
+	kernel_split_params.tile.stride = stride;
 
-	if(x < (sw * parallel_samples) && y < sh) {
-		int ray_index = x + y * (sw * parallel_samples);
+	kernel_split_params.tile.buffer = buffer;
 
-		/* This is the first assignment to ray_state;
-		 * So we dont use ASSIGN_RAY_STATE macro.
-		 */
-		ray_state[ray_index] = RAY_ACTIVE;
-
-		unsigned int my_sample;
-		unsigned int pixel_x;
-		unsigned int pixel_y;
-		unsigned int tile_x;
-		unsigned int tile_y;
-		unsigned int my_sample_tile;
-
-#ifdef __WORK_STEALING__
-		unsigned int my_work = 0;
-		/* Get work. */
-		get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
-		/* Get the sample associated with the work. */
-		my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-
-		my_sample_tile = 0;
-
-		/* Get pixel and tile position associated with the work. */
-		get_pixel_tile_position(&pixel_x, &pixel_y,
-		                        &tile_x, &tile_y,
-		                        my_work,
-		                        sw, sh, sx, sy,
-		                        parallel_samples,
-		                        ray_index);
-		work_array[ray_index] = my_work;
-#else  /* __WORK_STEALING__ */
-		unsigned int tile_index = ray_index / parallel_samples;
-		tile_x = tile_index % sw;
-		tile_y = tile_index / sw;
-		my_sample_tile = ray_index - (tile_index * parallel_samples);
-		my_sample = my_sample_tile + start_sample;
-
-		/* Initialize work array. */
-		work_array[ray_index] = my_sample ;
-
-		/* Calculate pixel position of this ray. */
-		pixel_x = sx + tile_x;
-		pixel_y = sy + tile_y;
-#endif  /* __WORK_STEALING__ */
-
-		rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
-
-		/* Initialise per_sample_output_buffers to all zeros. */
-		per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride;
-		int per_sample_output_buffers_iterator = 0;
-		for(per_sample_output_buffers_iterator = 0;
-		    per_sample_output_buffers_iterator < kernel_data.film.pass_stride;
-		    per_sample_output_buffers_iterator++)
-		{
-			per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f;
-		}
+	kernel_split_params.total_work_size = sw * sh * num_samples;
+
+	kernel_split_params.work_pools = work_pools;
 
-		/* Initialize random numbers and ray. */
-		kernel_path_trace_setup(kg,
-		                        rng_state,
-		                        my_sample,
-		                        pixel_x, pixel_y,
-		                        &rng_coop[ray_index],
-		                        &Ray_coop[ray_index]);
-
-		if(Ray_coop[ray_index].t != 0.0f) {
-			/* Initialize throughput, L_transparent, Ray, PathState;
-			 * These rays proceed with path-iteration.
-			 */
-			throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
-			L_transparent_coop[ray_index] = 0.0f;
-			path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass);
-			path_state_init(kg,
-			                kg->sd_input,
-			                &PathState_coop[ray_index],
-			                &rng_coop[ray_index],
-			                my_sample,
-			                &Ray_coop[ray_index]);
-#ifdef __KERNEL_DEBUG__
-			debug_data_init(&debugdata_coop[ray_index]);
+	kernel_split_params.queue_index = Queue_index;
+	kernel_split_params.queue_size = queuesize;
+	kernel_split_params.use_queues_flag = use_queues_flag;
+
+	split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state);
+
+#ifdef __KERNEL_OPENCL__
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 #endif
-		}
-		else {
-			/* These rays do not participate in path-iteration. */
-			float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			/* Accumulate result in output buffer. */
-			kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad);
-			path_rng_end(kg, rng_state, rng_coop[ray_index]);
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+
+	/* Initialize queue data and queue index. */
+	if(thread_index < queuesize) {
+		for(int i = 0; i < NUM_QUEUES; i++) {
+			kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
 		}
 	}
 
-	/* Mark rest of the ray-state indices as RAY_INACTIVE. */
-	if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) {
-		/* First assignment, hence we dont use ASSIGN_RAY_STATE macro */
-		ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE;
+	if(thread_index == 0) {
+		for(int i = 0; i < NUM_QUEUES; i++) {
+			Queue_index[i] = 0;
+		}
+
+		/* The scene-intersect kernel should not use the queues very first time.
+		 * since the queue would be empty.
+		 */
+		*use_queues_flag = 0;
 	}
+#endif  /* KERNEL_STUB */
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
index 82ca18829d3..2aac66ecb84 100644
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -14,95 +14,136 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_direct_lighting kernel.
- * This is the eighth kernel in the ray tracing logic. This is the seventh
- * of the path iteration kernels. This kernel takes care of direct lighting
- * logic. However, the "shadow ray cast" part of direct lighting is handled
+/* This kernel takes care of direct lighting logic.
+ * However, the "shadow ray cast" part of direct lighting is handled
  * in the next kernel.
  *
- * This kernels determines the rays for which a shadow_blocked() function associated with direct lighting should be executed.
- * Those rays for which a shadow_blocked() function for direct-lighting must be executed, are marked with flag RAY_SHADOW_RAY_CAST_DL and
- * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS
+ * This kernel determines the rays for which a shadow_blocked() function
+ * associated with direct lighting should be executed. Those rays for which
+ * a shadow_blocked() function for direct-lighting must be executed are
+ * marked with flag RAY_SHADOW_RAY_CAST_DL and enqueued into the queue
+ * QUEUE_SHADOW_RAY_CAST_DL_RAYS.
  *
- * The input and output are as follows,
+ * Note on Queues:
+ * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue
+ * and processes only the rays of state RAY_ACTIVE; If a ray needs to execute
+ * the corresponding shadow_blocked part, after direct lighting, the ray is
+ * marked with RAY_SHADOW_RAY_CAST_DL flag.
  *
- * rng_coop -----------------------------------------|--- kernel_direct_lighting --|--- BSDFEval_coop
- * PathState_coop -----------------------------------|                             |--- ISLamp_coop
- * sd -----------------------------------------------|                             |--- LightRay_coop
- * ray_state ----------------------------------------|                             |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                             |
- * kg (globals) -------------------------------------|                             |
- * queuesize ----------------------------------------|                             |
- *
- * Note on Queues :
- * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
- * only the rays of state RAY_ACTIVE; If a ray needs to execute the corresponding shadow_blocked
- * part, after direct lighting, the ray is marked with RAY_SHADOW_RAY_CAST_DL flag.
- *
- * State of queues when this kernel is called :
- * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
- * before and after this kernel call.
- * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this
- * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
+ * State of queues when this kernel is called:
+ * - State of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and
+ *   QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same before and after this
+ *   kernel call.
+ * - The QUEUE_SHADOW_RAY_CAST_DL_RAYS queue is empty before this kernel call;
+ *   after the call it is filled with the rays for which a shadow_blocked
+ *   function must be executed.
  */
-ccl_device char kernel_direct_lighting(
-        KernelGlobals *kg,
-        ShaderData *sd,                         /* Required for direct lighting */
-        ccl_global uint *rng_coop,              /* Required for direct lighting */
-        ccl_global PathState *PathState_coop,   /* Required for direct lighting */
-        ccl_global int *ISLamp_coop,            /* Required for direct lighting */
-        ccl_global Ray *LightRay_coop,          /* Required for direct lighting */
-        ccl_global BsdfEval *BSDFEval_coop,     /* Required for direct lighting */
-        ccl_global char *ray_state,             /* Denotes the state of each ray */
-        int ray_index)
+ccl_device void kernel_direct_lighting(KernelGlobals *kg,
+                                       ccl_local_param unsigned int *local_queue_atomics)
 {
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
 	char enqueue_flag = 0;
-	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		ccl_global PathState *state = &PathState_coop[ray_index];
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+		ShaderData *sd = &kernel_split_state.sd[ray_index];
 
 		/* direct lighting */
 #ifdef __EMISSION__
-		if((kernel_data.integrator.use_direct_light &&
-		    (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)))
-		{
+		bool flag = (kernel_data.integrator.use_direct_light &&
+		             (sd->flag & SD_BSDF_HAS_EVAL));
+
+#  ifdef __BRANCHED_PATH__
+		if(flag && kernel_data.integrator.branched) {
+			flag = false;
+			enqueue_flag = 1;
+		}
+#  endif  /* __BRANCHED_PATH__ */
+
+#  ifdef __SHADOW_TRICKS__
+		if(flag && state->flag & PATH_RAY_SHADOW_CATCHER) {
+			flag = false;
+			enqueue_flag = 1;
+		}
+#  endif  /* __SHADOW_TRICKS__ */
+
+		if(flag) {
 			/* Sample illumination from lights to find path contribution. */
-			ccl_global RNG* rng = &rng_coop[ray_index];
-			float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 			float light_u, light_v;
-			path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-			float terminate = path_state_rng_light_termination(kg, rng, state);
+			path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
+			float terminate = path_state_rng_light_termination(kg, state);
 
 			LightSample ls;
 			if(light_sample(kg,
-			                light_t, light_u, light_v,
-			                ccl_fetch(sd, time),
-			                ccl_fetch(sd, P),
+			                light_u, light_v,
+			                sd->time,
+			                sd->P,
 			                state->bounce,
 			                &ls)) {
 
 				Ray light_ray;
-#ifdef __OBJECT_MOTION__
-				light_ray.time = ccl_fetch(sd, time);
-#endif
+				light_ray.time = sd->time;
 
 				BsdfEval L_light;
 				bool is_lamp;
-				if(direct_emission(kg, sd, kg->sd_input, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
+				if(direct_emission(kg, sd, &kernel_split_state.sd_DL_shadow[ray_index], &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 					/* Write intermediate data to global memory to access from
 					 * the next kernel.
 					 */
-					LightRay_coop[ray_index] = light_ray;
-					BSDFEval_coop[ray_index] = L_light;
-					ISLamp_coop[ray_index] = is_lamp;
+					kernel_split_state.light_ray[ray_index] = light_ray;
+					kernel_split_state.bsdf_eval[ray_index] = L_light;
+					kernel_split_state.is_lamp[ray_index] = is_lamp;
 					/* Mark ray state for next shadow kernel. */
-					ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
 					enqueue_flag = 1;
 				}
 			}
 		}
 #endif  /* __EMISSION__ */
 	}
-	return enqueue_flag;
+
+#ifdef __EMISSION__
+	/* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+#endif
+
+#ifdef __BRANCHED_PATH__
+	/* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays.
+	 * This is the last kernel before next_iteration_setup that uses local atomics, so we do this here.
+	 */
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_LIGHT_INDIRECT_ITER,
+	                        IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER),
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+
+#endif  /* __BRANCHED_PATH__ */
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h
new file mode 100644
index 00000000000..491487f1230
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_do_volume.h
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#if defined(__BRANCHED_PATH__) && defined(__VOLUME__)
+
+ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg, int ray_index)
+{
+	kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+	/* After the shared indirect-loop init, tag the ray with RAY_BRANCHED_VOLUME_INDIRECT. */
+	ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT);
+}
+
+ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg, int ray_index)
+{
+	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+
+	ShaderData *sd = &kernel_split_state.sd[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+	/* GPU: no decoupled ray marching, scatter probabilistically */
+	int num_samples = kernel_data.integrator.volume_samples;
+	float num_samples_inv = 1.0f/num_samples;
+	/* The volume segment ends at the saved hit distance, or at FLT_MAX when the saved state is RAY_HIT_BACKGROUND. */
+	Ray volume_ray = branched_state->ray;
+	volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ? branched_state->isect.t : FLT_MAX;
+
+	bool heterogeneous = volume_stack_is_heterogeneous(kg, branched_state->path_state.volume_stack);
+	/* Resume at next_sample; every iteration restarts from the saved branched state with throughput scaled by 1/num_samples. */
+	for(int j = branched_state->next_sample; j < num_samples; j++) {
+		ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
+		*ps = branched_state->path_state;
+
+		ccl_global Ray *pray = &kernel_split_state.ray[ray_index];
+		*pray = branched_state->ray;
+
+		ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
+		*tp = branched_state->throughput * num_samples_inv;
+
+		/* branch RNG state */
+		path_state_branch(ps, j, num_samples);
+
+		/* integrate along volume segment with distance sampling */
+		VolumeIntegrateResult result = kernel_volume_integrate(
+			kg, ps, sd, &volume_ray, L, tp, heterogeneous);
+
+#  ifdef __VOLUME_SCATTER__
+		if(result == VOLUME_PATH_SCATTERED) {
+			/* direct lighting */
+			kernel_path_volume_connect_light(kg, sd, emission_sd, *tp, &branched_state->path_state, L);
+
+			/* indirect light bounce */
+			if(!kernel_path_volume_bounce(kg, sd, tp, ps, &L->state, pray)) {
+				continue;
+			}
+
+			/* start the indirect path; record the sample to resume from on the next call */
+			branched_state->next_closure = 0;
+			branched_state->next_sample = j+1;
+
+			/* Attempting to share too many samples is slow for volumes as it causes us to
+			 * loop here more and have many calls to kernel_volume_integrate which evaluates
+			 * shaders. The many expensive shader evaluations cause the work load to become
+			 * unbalanced and many threads to become idle in this kernel. Limiting the
+			 * number of shared samples here helps quite a lot.
+			 */
+			if(branched_state->shared_sample_count < 2) {
+				if(kernel_split_branched_indirect_start_shared(kg, ray_index)) {
+					continue;
+				}
+			}
+
+			return true;
+		}
+#  endif
+	}
+	/* Loop exhausted: setting next_sample to num_samples makes a later call skip the loop entirely. */
+	branched_state->next_sample = num_samples;
+
+	branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+	if(branched_state->waiting_on_shared_samples) {
+		return true;
+	}
+
+	kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+	/* todo: avoid this calculation using decoupled ray marching */
+	float3 throughput = kernel_split_state.throughput[ray_index];
+	kernel_volume_shadow(kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput);
+	kernel_split_state.throughput[ray_index] = throughput;
+
+	return false;
+}
+
+#endif  /* __BRANCHED_PATH__ && __VOLUME__ */
+
+ccl_device void kernel_do_volume(KernelGlobals *kg)
+{
+#ifdef __VOLUME__
+	/* The queues read below are emptied by this kernel, so work-item (0, 0) resets their element counts. */
+	if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+		kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+#  ifdef __BRANCHED_PATH__
+		kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0;
+#  endif  /* __BRANCHED_PATH__ */
+	}
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	/* When queues are in use, replace the flat work-item index with the ray index looked up in the queue. */
+	if(*kernel_split_params.use_queues_flag) {
+		ray_index = get_ray_index(kg, ray_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          kernel_split_state.queue_data,
+		                          kernel_split_params.queue_size,
+		                          1);
+	}
+
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	/* Only rays in state RAY_ACTIVE or RAY_HIT_BACKGROUND are processed here. */
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
+	   IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+		ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+		ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
+		ShaderData *sd = &kernel_split_state.sd[ray_index];
+		ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+		bool hit = ! IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
+
+		/* Sanitize volume stack. */
+		if(!hit) {
+			kernel_volume_clean_stack(kg, state->volume_stack);
+		}
+		/* volume attenuation, emission, scatter */
+		if(state->volume_stack[0].shader != SHADER_NONE) {
+			Ray volume_ray = *ray;
+			volume_ray.t = (hit)? isect->t: FLT_MAX;
+
+#  ifdef __BRANCHED_PATH__
+			if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#  endif  /* __BRANCHED_PATH__ */
+				bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+
+				{
+					/* integrate along volume segment with distance sampling */
+					VolumeIntegrateResult result = kernel_volume_integrate(
+						kg, state, sd, &volume_ray, L, throughput, heterogeneous);
+
+#  ifdef __VOLUME_SCATTER__
+					if(result == VOLUME_PATH_SCATTERED) {
+						/* direct lighting */
+						kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
+
+						/* indirect light bounce */
+						if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) {
+							ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+						}
+						else {
+							kernel_split_path_end(kg, ray_index);
+						}
+					}
+#  endif  /* __VOLUME_SCATTER__ */
+				}
+
+#  ifdef __BRANCHED_PATH__
+			}
+			else {
+				kernel_split_branched_path_volume_indirect_light_init(kg, ray_index);
+
+				if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
+					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+				}
+			}
+#  endif  /* __BRANCHED_PATH__ */
+		}
+	}
+
+#  ifdef __BRANCHED_PATH__
+	/* iter loop: continue rays fetched from QUEUE_VOLUME_INDIRECT_ITER */
+	ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+	                          QUEUE_VOLUME_INDIRECT_ITER,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          1);
+
+	if(IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) {
+		/* for render passes, sum and reset indirect light pass variables
+		 * for the next samples */
+		path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
+		path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
+
+		if(kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+		}
+	}
+#  endif  /* __BRANCHED_PATH__ */
+
+#endif  /* __VOLUME__ */
+}
+
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_enqueue_inactive.h b/intern/cycles/kernel/split/kernel_enqueue_inactive.h
new file mode 100644
index 00000000000..496355bbc3a
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_enqueue_inactive.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_enqueue_inactive(KernelGlobals *kg,
+                                        ccl_local_param unsigned int *local_queue_atomics)
+{
+#ifdef __BRANCHED_PATH__
+	/* Enqueue RAY_INACTIVE rays into QUEUE_INACTIVE_RAYS queue; local work-item (0, 0) zeroes the local counter first. */
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	/* Here ray_index is the flat work-item index itself, not an index looked up in a queue. */
+	char enqueue_flag = 0;
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_INACTIVE)) {
+		enqueue_flag = 1;
+	}
+
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_INACTIVE_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+#endif  /* __BRANCHED_PATH__ */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 435d1171d5c..906bad8ceb6 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -14,247 +14,161 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel.
- * This is the sixth kernel in the ray tracing logic. This is the fifth
- * of the path iteration kernels. This kernel takes care of the logic to process
- * "material of type holdout", indirect primitive emission, bsdf blurring,
- * probabilistic path termination and AO.
+/* This kernel takes care of the logic to process "material of type holdout",
+ * indirect primitive emission, bsdf blurring, probabilistic path termination
+ * and AO.
  *
- * This kernels determines the rays for which a shadow_blocked() function associated with AO should be executed.
- * Those rays for which a shadow_blocked() function for AO must be executed are marked with flag RAY_SHADOW_RAY_CAST_ao and
- * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS
+ * This kernel determines the rays for which a shadow_blocked() function
+ * associated with AO should be executed. Those rays for which a
+ * shadow_blocked() function for AO must be executed are marked with flag
+ * RAY_SHADOW_RAY_CAST_AO and enqueued into the queue
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS
  *
  * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
  *
- * The input and output are as follows,
+ * Note on Queues:
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
+ * and processes only the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and
+ * reach RAY_UPDATE_BUFFER state. These rays are enqueued into
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present
+ * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has
+ * been changed to RAY_UPDATE_BUFFER, there is no problem.
  *
- * rng_coop ---------------------------------------------|--- kernel_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * throughput_coop --------------------------------------|                                                           |--- PathState_coop
- * PathRadiance_coop ------------------------------------|                                                           |--- throughput_coop
- * Intersection_coop ------------------------------------|                                                           |--- L_transparent_coop
- * PathState_coop ---------------------------------------|                                                           |--- per_sample_output_buffers
- * L_transparent_coop -----------------------------------|                                                           |--- PathRadiance_coop
- * sd ---------------------------------------------------|                                                           |--- ShaderData
- * ray_state --------------------------------------------|                                                           |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------|                                                           |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                                           |--- AOAlpha_coop
- * kg (globals) -----------------------------------------|                                                           |--- AOBSDF_coop
- * parallel_samples -------------------------------------|                                                           |--- AOLightRay_coop
- * per_sample_output_buffers ----------------------------|                                                           |
- * sw ---------------------------------------------------|                                                           |
- * sh ---------------------------------------------------|                                                           |
- * sx ---------------------------------------------------|                                                           |
- * sy ---------------------------------------------------|                                                           |
- * stride -----------------------------------------------|                                                           |
- * work_array -------------------------------------------|                                                           |
- * queuesize --------------------------------------------|                                                           |
- * start_sample -----------------------------------------|                                                           |
- *
- * Note on Queues :
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
- * the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER
- * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
- * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
- * changed to RAY_UPDATE_BUFFER, there is no problem.
- *
- * State of queues when this kernel is called :
+ * State of queues when this kernel is called:
  * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
+ *     RAY_REGENERATED rays
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE rays.
+ *   - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
  * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
+ *     RAY_REGENERATED and RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with
+ *     flag RAY_SHADOW_RAY_CAST_AO
  */
+
 ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
         KernelGlobals *kg,
-        ShaderData *sd,                        /* Required throughout the kernel except probabilistic path termination and AO */
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_coop,             /* Required for "kernel_write_data_passes" and AO */
-        ccl_global float3 *throughput_coop,    /* Required for handling holdout material and AO */
-        ccl_global float *L_transparent_coop,  /* Required for handling holdout material */
-        PathRadiance *PathRadiance_coop,       /* Required for "kernel_write_data_passes" and indirect primitive emission */
-        ccl_global PathState *PathState_coop,  /* Required throughout the kernel and AO */
-        Intersection *Intersection_coop,       /* Required for indirect primitive emission */
-        ccl_global float3 *AOAlpha_coop,       /* Required for AO */
-        ccl_global float3 *AOBSDF_coop,        /* Required for AO */
-        ccl_global Ray *AOLightRay_coop,       /* Required for AO */
-        int sw, int sh, int sx, int sy, int stride,
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        ccl_global unsigned int *work_array,   /* Denotes the work that each ray belongs to */
-#ifdef __WORK_STEALING__
-        unsigned int start_sample,
-#endif
-        int parallel_samples,                  /* Number of samples to be processed in parallel */
-        int ray_index,
-        char *enqueue_flag,
-        char *enqueue_flag_AO_SHADOW_RAY_CAST)
+        ccl_local_param BackgroundAOLocals *locals)
 {
-#ifdef __WORK_STEALING__
-	unsigned int my_work;
-	unsigned int pixel_x;
-	unsigned int pixel_y;
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {  /* Only local id (0, 0) resets the counters. */
+		locals->queue_atomics_bg = 0;
+		locals->queue_atomics_ao = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+#ifdef __AO__
+	char enqueue_flag = 0;
+#endif  /* __AO__ */
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);  /* Linearized 2D global id. */
+	ray_index = get_ray_index(kg, ray_index,  /* Queue lookup; the result may be QUEUE_EMPTY_SLOT. */
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif  /* __COMPUTE_DEVICE_GPU__ */
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
 #endif
-	unsigned int tile_x;
-	unsigned int tile_y;
-	int my_sample_tile;
-	unsigned int sample;
 
-	ccl_global RNG *rng = 0x0;
 	ccl_global PathState *state = 0x0;
 	float3 throughput;
 
-	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+	ShaderData *sd = &kernel_split_state.sd[ray_index];
 
-		throughput = throughput_coop[ray_index];
-		state = &PathState_coop[ray_index];
-		rng = &rng_coop[ray_index];
-#ifdef __WORK_STEALING__
-		my_work = work_array[ray_index];
-		sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-		get_pixel_tile_position(&pixel_x, &pixel_y,
-		                        &tile_x, &tile_y,
-		                        my_work,
-		                        sw, sh, sx, sy,
-		                        parallel_samples,
-		                        ray_index);
-		my_sample_tile = 0;
-#else  /* __WORK_STEALING__ */
-		sample = work_array[ray_index];
-		/* Buffer's stride is "stride"; Find x and y using ray_index. */
-		int tile_index = ray_index / parallel_samples;
-		tile_x = tile_index % sw;
-		tile_y = tile_index / sw;
-		my_sample_tile = ray_index - (tile_index * parallel_samples);
-#endif  /* __WORK_STEALING__ */
-		per_sample_output_buffers +=
-		    (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) *
-		    kernel_data.film.pass_stride;
-
-		/* holdout */
-#ifdef __HOLDOUT__
-		if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) &&
-		   (state->flag & PATH_RAY_CAMERA))
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+		ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+
+		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+		ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];  /* The sd_DL_shadow entry serves as emission ShaderData. */
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+
+		throughput = kernel_split_state.throughput[ray_index];
+		state = &kernel_split_state.path_state[ray_index];
+
+		if(!kernel_path_shader_apply(kg,  /* A false return ends the path. */
+		                             sd,
+		                             state,
+		                             ray,
+		                             throughput,
+		                             emission_sd,
+		                             L,
+		                             buffer))
 		{
-			if(kernel_data.background.transparent) {
-				float3 holdout_weight;
-
-				if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK)
-					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
-				else
-					holdout_weight = shader_holdout_eval(kg, sd);
-
-				/* any throughput is ok, should all be identical here */
-				L_transparent_coop[ray_index] += average(holdout_weight*throughput);
-			}
-
-			if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) {
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-				*enqueue_flag = 1;
-			}
+			kernel_split_path_end(kg, ray_index);
 		}
-#endif  /* __HOLDOUT__ */
 	}
 
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		PathRadiance *L = &PathRadiance_coop[ray_index];
-		/* Holdout mask objects do not write data passes. */
-		kernel_write_data_passes(kg,
-		                         per_sample_output_buffers,
-		                         L,
-		                         sd,
-		                         sample,
-		                         state,
-		                         throughput);
-		/* Blurring of bsdf after bounces, for rays that have a small likelihood
-		 * of following this particular path (diffuse, rough glossy.
-		 */
-		if(kernel_data.integrator.filter_glossy != FLT_MAX) {
-			float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
-			if(blur_pdf < 1.0f) {
-				float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
-				shader_bsdf_blur(kg, sd, blur_roughness);
-			}
-		}
-
-#ifdef __EMISSION__
-		/* emission */
-		if(ccl_fetch(sd, flag) & SD_EMISSION) {
-			/* TODO(sergey): is isect.t wrong here for transparent surfaces? */
-			float3 emission = indirect_primitive_emission(
-			        kg,
-			        sd,
-			        Intersection_coop[ray_index].t,
-			        state->flag,
-			        state->ray_pdf);
-			path_radiance_accum_emission(L, throughput, emission, state->bounce);
-		}
-#endif  /* __EMISSION__ */
-
 		/* Path termination. this is a strange place to put the termination, it's
 		 * mainly due to the mixed in MIS that we use. gives too many unneeded
 		 * shader evaluations, only need emission if we are going to terminate.
 		 */
-		float probability = path_state_terminate_probability(kg, state, throughput);
+		float probability = path_state_continuation_probability(kg, state, throughput);
 
 		if(probability == 0.0f) {
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-			*enqueue_flag = 1;
+			kernel_split_path_end(kg, ray_index);
+		}
+		else if(probability < 1.0f) {  /* Random termination; survivors store throughput/probability. */
+			float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
+			if(terminate >= probability) {
+				kernel_split_path_end(kg, ray_index);
+			}
+			else {
+				kernel_split_state.throughput[ray_index] = throughput/probability;
+			}
 		}
 
 		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-			if(probability != 1.0f) {
-				float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
-				if(terminate >= probability) {
-					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-					*enqueue_flag = 1;
-				}
-				else {
-					throughput_coop[ray_index] = throughput/probability;
-				}
-			}
+			PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+			kernel_update_denoising_features(kg, sd, state, L);
 		}
 	}
 
 #ifdef __AO__
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
 		/* ambient occlusion */
-		if(kernel_data.integrator.use_ambient_occlusion ||
-		   (ccl_fetch(sd, flag) & SD_AO))
-		{
-			/* todo: solve correlation */
-			float bsdf_u, bsdf_v;
-			path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
-			float ao_factor = kernel_data.background.ao_factor;
-			float3 ao_N;
-			AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
-			AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd);
-
-			float3 ao_D;
-			float ao_pdf;
-			sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
-			if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
-				Ray _ray;
-				_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
-				_ray.D = ao_D;
-				_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
-				_ray.time = ccl_fetch(sd, time);
-#endif
-				_ray.dP = ccl_fetch(sd, dP);
-				_ray.dD = differential3_zero();
-				AOLightRay_coop[ray_index] = _ray;
-
-				ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
-				*enqueue_flag_AO_SHADOW_RAY_CAST = 1;
-			}
+		if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
+			enqueue_flag = 1;  /* Only flagged here; the enqueue call is at the end of the function. */
 		}
 	}
 #endif  /* __AO__ */
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif  /* !__COMPUTE_DEVICE_GPU__ */
+
+#ifdef __AO__
+	/* Enqueue the rays flagged above into QUEUE_SHADOW_RAY_CAST_AO_RAYS. */
+	enqueue_ray_index_local(ray_index,  /* Also reached by CPU threads with an empty slot (flag stays 0). */
+	                        QUEUE_SHADOW_RAY_CAST_AO_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        &locals->queue_atomics_ao,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+#endif  /* __AO__ */
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h
new file mode 100644
index 00000000000..437043a5971
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_indirect_background.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_indirect_background(KernelGlobals *kg)
+{
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);  /* Linearized 2D global id. */
+	int ray_index;
+
+	if(kernel_data.integrator.ao_bounces != INT_MAX) {  /* NOTE(review): INT_MAX presumably means "no AO bounce limit" - confirm. */
+		ray_index = get_ray_index(kg, thread_index,  /* First pass: look at the active/regenerated queue. */
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          kernel_split_state.queue_data,
+		                          kernel_split_params.queue_size,
+		                          0);
+
+		if(ray_index != QUEUE_EMPTY_SLOT) {
+			if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+				ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+				if(path_state_ao_bounce(kg, state)) {  /* A true result ends the active path. */
+					kernel_split_path_end(kg, ray_index);
+				}
+			}
+		}
+	}
+
+	ray_index = get_ray_index(kg, thread_index,  /* Second pass: same thread slot, different queue. */
+	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+	if(ray_index == QUEUE_EMPTY_SLOT) {  /* Nothing queued for this thread. */
+		return;
+	}
+
+	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {  /* Other states in this queue are left untouched. */
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+		float3 throughput = kernel_split_state.throughput[ray_index];
+		ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];  /* The sd_DL_shadow entry serves as emission ShaderData. */
+
+		kernel_path_background(kg, state, ray, throughput, emission_sd, L);
+		kernel_split_path_end(kg, ray_index);  /* The path always ends after the background call. */
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
new file mode 100644
index 00000000000..e9fe5552e8c
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_indirect_subsurface(KernelGlobals *kg)
+{
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);  /* Linearized 2D global id. */
+	if(thread_index == 0) {
+		/* We will empty both queues in this kernel. */
+		kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+		kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+	}
+
+	int ray_index;
+	get_ray_index(kg, thread_index,  /* Result discarded; NOTE(review): last argument 1 presumably clears the slot - confirm. */
+	              QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	              kernel_split_state.queue_data,
+	              kernel_split_params.queue_size,
+	              1);
+	ray_index = get_ray_index(kg, thread_index,  /* Only the index from this queue is used below. */
+	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          1);
+
+#ifdef __SUBSURFACE__  /* The queue handling above is compiled in either way. */
+
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+
+#ifdef __BRANCHED_PATH__
+	if(!kernel_data.integrator.branched) {  /* Skipped entirely when the branched integrator is in use. */
+#endif
+		if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+			ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+
+			/* Trace indirect subsurface rays by restarting the loop. This uses less
+			 * stack memory than invoking kernel_path_indirect.
+			 */
+			if(ss_indirect->num_rays) {
+				kernel_path_subsurface_setup_indirect(kg,
+					                                  ss_indirect,
+					                                  state,
+					                                  ray,
+					                                  L,
+					                                  throughput);
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);  /* RAY_UPDATE_BUFFER -> RAY_REGENERATED. */
+			}
+		}
+#ifdef __BRANCHED_PATH__
+	}
+#endif
+
+#endif  /* __SUBSURFACE__ */
+
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h
index 3bd0e361078..448456d167d 100644
--- a/intern/cycles/kernel/split/kernel_lamp_emission.h
+++ b/intern/cycles/kernel/split/kernel_lamp_emission.h
@@ -14,70 +14,55 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_lamp_emission
- * This is the 3rd kernel in the ray-tracing logic. This is the second of the
- * path-iteration kernels. This kernel takes care of the indirect lamp emission logic.
- * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE
- * and RAY_HIT_BACKGROUND.
+/* This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ * It processes rays of state RAY_ACTIVE and RAY_HIT_BACKGROUND.
  * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel.
- * The input/output of the kernel is as follows,
- * Throughput_coop ------------------------------------|--- kernel_lamp_emission --|--- PathRadiance_coop
- * Ray_coop -------------------------------------------|                           |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * PathState_coop -------------------------------------|                           |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * kg (globals) ---------------------------------------|                           |
- * Intersection_coop ----------------------------------|                           |
- * ray_state ------------------------------------------|                           |
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----|                           |
- * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----|                           |
- * queuesize ------------------------------------------|                           |
- * use_queues_flag ------------------------------------|                           |
- * sw -------------------------------------------------|                           |
- * sh -------------------------------------------------|                           |
  */
-ccl_device void kernel_lamp_emission(
-        KernelGlobals *kg,
-        ccl_global float3 *throughput_coop,    /* Required for lamp emission */
-        PathRadiance *PathRadiance_coop,       /* Required for lamp emission */
-        ccl_global Ray *Ray_coop,              /* Required for lamp emission */
-        ccl_global PathState *PathState_coop,  /* Required for lamp emission */
-        Intersection *Intersection_coop,       /* Required for lamp emission */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int sw, int sh,
-        ccl_global char *use_queues_flag,      /* Used to decide if this kernel should use
-                                                * queues to fetch ray index
-                                                */
-        int ray_index)
+ccl_device void kernel_lamp_emission(KernelGlobals *kg)
 {
-	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
-	   IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND))
-	{
-		PathRadiance *L = &PathRadiance_coop[ray_index];
-		ccl_global PathState *state = &PathState_coop[ray_index];
+#ifndef __VOLUME__
+	/* Without __VOLUME__ the queue index is reset here, by global id (0, 0). */
+	if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+		kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+	}
+#endif
+	/* Fetch use_queues_flag into a private copy before the barrier. */
+	char local_use_queues_flag = *kernel_split_params.use_queues_flag;
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
-		float3 throughput = throughput_coop[ray_index];
-		Ray ray = Ray_coop[ray_index];
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);  /* Linearized 2D global id. */
+	if(local_use_queues_flag) {  /* Otherwise the linear global id itself is the ray index. */
+		ray_index = get_ray_index(kg, ray_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          kernel_split_state.queue_data,
+		                          kernel_split_params.queue_size,
+#ifndef __VOLUME__
+		                          1  /* NOTE(review): presumably asks get_ray_index() to clear the slot - confirm. */
+#else
+		                          0
+#endif
+		                          );
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	}
 
-#ifdef __LAMP_MIS__
-		if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
-			/* ray starting from previous non-transparent bounce */
-			Ray light_ray;
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
+	   IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND))
+	{
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 
-			light_ray.P = ray.P - state->ray_t*ray.D;
-			state->ray_t += Intersection_coop[ray_index].t;
-			light_ray.D = ray.D;
-			light_ray.t = state->ray_t;
-			light_ray.time = ray.time;
-			light_ray.dD = ray.dD;
-			light_ray.dP = ray.dP;
-			/* intersect with lamp */
-			float3 emission;
+		float3 throughput = kernel_split_state.throughput[ray_index];
+		Ray ray = kernel_split_state.ray[ray_index];  /* Private copy; its address is passed below. */
+		ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
+		ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];  /* The sd_DL_shadow entry serves as emission ShaderData. */
 
-			if(indirect_lamp_emission(kg, kg->sd_input, state, &light_ray, &emission)) {
-				path_radiance_accum_emission(L, throughput, emission, state->bounce);
-			}
-		}
-#endif  /* __LAMP_MIS__ */
+		kernel_path_lamp_emission(kg, state, &ray, throughput, isect, emission_sd, L);
 	}
 }
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
index 816f3a6fbff..c3373174582 100644
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -14,128 +14,230 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_setup_next_iteration kernel.
- * This is the tenth kernel in the ray tracing logic. This is the ninth
- * of the path iteration kernels. This kernel takes care of setting up
- * Ray for the next iteration of path-iteration and accumulating radiance
- * corresponding to AO and direct-lighting
+/* This kernel takes care of setting up the ray for the next iteration of
+ * path-iteration and accumulating radiance corresponding to AO and
+ * direct-lighting.
  *
- * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
+ * Ray state of rays that are terminated in this kernel are changed
+ * to RAY_UPDATE_BUFFER.
  *
- * The input and output are as follows,
+ * Note on queues:
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
+ * and processes only the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and
+ * reach the RAY_UPDATE_BUFFER state. These rays are enqueued into the
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present
+ * in the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state
+ * has been changed to RAY_UPDATE_BUFFER, there is no problem.
  *
- * rng_coop ---------------------------------------------|--- kernel_next_iteration_setup -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * throughput_coop --------------------------------------|                                 |--- Queue_data (QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
- * PathRadiance_coop ------------------------------------|                                 |--- throughput_coop
- * PathState_coop ---------------------------------------|                                 |--- PathRadiance_coop
- * sd ---------------------------------------------------|                                 |--- PathState_coop
- * ray_state --------------------------------------------|                                 |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS) --------|                                 |--- Ray_coop
- * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                 |--- use_queues_flag
- * Ray_coop ---------------------------------------------|                                 |
- * kg (globals) -----------------------------------------|                                 |
- * LightRay_dl_coop -------------------------------------|
- * ISLamp_coop ------------------------------------------|
- * BSDFEval_coop ----------------------------------------|
- * LightRay_ao_coop -------------------------------------|
- * AOBSDF_coop ------------------------------------------|
- * AOAlpha_coop -----------------------------------------|
- *
- * Note on queues,
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
- * the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFF
- * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
- * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
- * changed to RAY_UPDATE_BUFF, there is no problem.
- *
- * State of queues when this kernel is called :
+ * State of queues when this kernel is called:
  * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
+ *     RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
  * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
+ *     RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays.
  */
-ccl_device char kernel_next_iteration_setup(
-        KernelGlobals *kg,
-        ShaderData *sd,                       /* Required for setting up ray for next iteration */
-        ccl_global uint *rng_coop,            /* Required for setting up ray for next iteration */
-        ccl_global float3 *throughput_coop,   /* Required for setting up ray for next iteration */
-        PathRadiance *PathRadiance_coop,      /* Required for setting up ray for next iteration */
-        ccl_global Ray *Ray_coop,             /* Required for setting up ray for next iteration */
-        ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
-        ccl_global Ray *LightRay_dl_coop,     /* Required for radiance update - direct lighting */
-        ccl_global int *ISLamp_coop,          /* Required for radiance update - direct lighting */
-        ccl_global BsdfEval *BSDFEval_coop,   /* Required for radiance update - direct lighting */
-        ccl_global Ray *LightRay_ao_coop,     /* Required for radiance update - AO */
-        ccl_global float3 *AOBSDF_coop,       /* Required for radiance update - AO */
-        ccl_global float3 *AOAlpha_coop,      /* Required for radiance update - AO */
-        ccl_global char *ray_state,           /* Denotes the state of each ray */
-        ccl_global char *use_queues_flag,     /* flag to decide if scene_intersect kernel should
-                                               * use queues to fetch ray index */
-        int ray_index)
+
+#ifdef __BRANCHED_PATH__
+ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index)
 {
-	char enqueue_flag = 0;
-
-	/* Load ShaderData structure. */
-	PathRadiance *L = NULL;
-	ccl_global PathState *state = NULL;
-
-	/* Path radiance update for AO/Direct_lighting's shadow blocked. */
-	if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
-	   IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
-	{
-		state = &PathState_coop[ray_index];
-		L = &PathRadiance_coop[ray_index];
-		float3 _throughput = throughput_coop[ray_index];
-
-		if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
-			float3 shadow = LightRay_ao_coop[ray_index].P;
-			char update_path_radiance = LightRay_ao_coop[ray_index].t;
-			if(update_path_radiance) {
-				path_radiance_accum_ao(L,
-				                       _throughput,
-				                       AOAlpha_coop[ray_index],
-				                       AOBSDF_coop[ray_index],
-				                       shadow,
-				                       state->bounce);
+	kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+	/* Set the RAY_BRANCHED_LIGHT_INDIRECT flag on this ray's state entry. */
+	ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT);
+}
+
+ccl_device void kernel_split_branched_indirect_light_end(KernelGlobals *kg, int ray_index)
+{
+	kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+	ShaderData *sd = &kernel_split_state.sd[ray_index];
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+
+	/* Continue in case of transparency: scale throughput by the BSDF transparency. */
+	*throughput *= shader_bsdf_transparency(kg, sd);
+
+	if(is_zero(*throughput)) {
+		kernel_split_path_end(kg, ray_index);
+	}
+	else {
+		/* Update path state: flag the path as transparent and count the bounce. */
+		state->flag |= PATH_RAY_TRANSPARENT;
+		state->transparent_bounce++;
+
+		ray->P = ray_offset(sd->P, -sd->Ng);
+		ray->t -= sd->ray_length; /* clipping works through transparent */
+
+#  ifdef __RAY_DIFFERENTIALS__
+		ray->dP = sd->dP;
+		ray->dD.dx = -sd->dI.dx;
+		ray->dD.dy = -sd->dI.dy;
+#  endif  /* __RAY_DIFFERENTIALS__ */
+
+#  ifdef __VOLUME__
+		/* enter/exit volume */
+		kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
+#  endif  /* __VOLUME__ */
+	}
+}
+#endif  /* __BRANCHED_PATH__ */
+
+ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
+                                            ccl_local_param unsigned int *local_queue_atomics)
+{
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+		/* If we are here, then it means that the scene-intersect kernel
+		 * has already been executed at least once. From the next time on,
+		 * the scene-intersect kernel may operate on queues to fetch ray index.
+		 */
+		*kernel_split_params.use_queues_flag = 1;
+
+		/* Reset queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and
+		 * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the
+		 * previous kernel.
+		 */
+		kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+	}
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+
+	bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE);
+	if(active) {
+		ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+		ShaderData *sd = &kernel_split_state.sd[ray_index];
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+
+#ifdef __BRANCHED_PATH__
+		if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#endif
+			/* Compute direct lighting and next bounce. */
+			if(!kernel_path_surface_bounce(kg, sd, throughput, state, &L->state, ray)) {
+				kernel_split_path_end(kg, ray_index);
 			}
-			REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+#ifdef __BRANCHED_PATH__
 		}
+		else {
+			kernel_split_branched_indirect_light_init(kg, ray_index);
 
-		if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
-			float3 shadow = LightRay_dl_coop[ray_index].P;
-			char update_path_radiance = LightRay_dl_coop[ray_index].t;
-			if(update_path_radiance) {
-				BsdfEval L_light = BSDFEval_coop[ray_index];
-				path_radiance_accum_light(L,
-				                          _throughput,
-				                          &L_light,
-				                          shadow,
-				                          1.0f,
-				                          state->bounce,
-				                          ISLamp_coop[ray_index]);
+			if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+			                                                          ray_index,
+			                                                          1.0f,
+			                                                          &kernel_split_state.branched_state[ray_index].sd,
+			                                                          true,
+			                                                          true))
+			{
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+			}
+			else {
+				kernel_split_branched_indirect_light_end(kg, ray_index);
 			}
-			REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
 		}
+#endif  /* __BRANCHED_PATH__ */
 	}
 
-	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		ccl_global float3 *throughput = &throughput_coop[ray_index];
-		ccl_global Ray *ray = &Ray_coop[ray_index];
-		ccl_global RNG *rng = &rng_coop[ray_index];
-		state = &PathState_coop[ray_index];
-		L = &PathRadiance_coop[ray_index];
-
-		/* Compute direct lighting and next bounce. */
-		if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) {
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-			enqueue_flag = 1;
+	/* Enqueue rays that were RAY_ACTIVE on entry and are now RAY_UPDATE_BUFFER. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+
+#ifdef __BRANCHED_PATH__
+	/* Light-indirect iteration loop: reset its queue index, then handle queued rays. */
+	if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+		kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0;
+	}
+
+	ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+	                          QUEUE_LIGHT_INDIRECT_ITER,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          1);
+
+	if(IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) {
+		/* for render passes, sum and reset indirect light pass variables
+		 * for the next samples */
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+
+		path_radiance_sum_indirect(L);
+		path_radiance_reset_indirect(L);
+
+		if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+		                                                          ray_index,
+		                                                          1.0f,
+		                                                          &kernel_split_state.branched_state[ray_index].sd,
+		                                                          true,
+		                                                          true))
+		{
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+		}
+		else {
+			kernel_split_branched_indirect_light_end(kg, ray_index);
 		}
 	}
 
-	return enqueue_flag;
+#  ifdef __VOLUME__
+	/* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays (local counter is reset first). */
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_VOLUME_INDIRECT_ITER,
+	                        IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER),
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+
+#  endif  /* __VOLUME__ */
+
+#  ifdef __SUBSURFACE__
+	/* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays (local counter is reset first). */
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SUBSURFACE_INDIRECT_ITER,
+	                        IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER),
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+#  endif  /* __SUBSURFACE__ */
+#endif  /* __BRANCHED_PATH__ */
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h
new file mode 100644
index 00000000000..5ad62b585fe
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_path_init.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel initializes structures needed in path-iteration kernels.
+ * This is the first kernel in ray-tracing logic.
+ *
+ * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
+ */
+ccl_device void kernel_path_init(KernelGlobals *kg) {
+	int ray_index = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
+
+	/* This is the first assignment to ray_state,
+	 * so we don't use the ASSIGN_RAY_STATE macro.
+	 */
+	kernel_split_state.ray_state[ray_index] = RAY_ACTIVE;
+
+	/* Get work. */
+	ccl_global uint *work_pools = kernel_split_params.work_pools;
+	uint total_work_size = kernel_split_params.total_work_size;
+	uint work_index;
+
+	if(!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
+		/* No more work, mark ray as inactive */
+		kernel_split_state.ray_state[ray_index] = RAY_INACTIVE;
+
+		return;
+	}
+
+	ccl_global WorkTile *tile = &kernel_split_params.tile;
+	uint x, y, sample;
+	get_work_pixel(tile, work_index, &x, &y, &sample);
+
+	/* Store buffer offset for writing to passes. */
+	uint buffer_offset = (tile->offset + x + y*tile->stride) * kernel_data.film.pass_stride;
+	kernel_split_state.buffer_offset[ray_index] = buffer_offset;
+
+	/* Initialize random numbers and ray. */
+	uint rng_hash;
+	kernel_path_trace_setup(kg,
+	                        sample,
+	                        x, y,
+	                        &rng_hash,
+	                        &kernel_split_state.ray[ray_index]);
+
+	if(kernel_split_state.ray[ray_index].t != 0.0f) {
+		/* Initialize throughput, path radiance, Ray, PathState;
+		 * These rays proceed with path-iteration.
+		 */
+		kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
+		path_radiance_init(&kernel_split_state.path_radiance[ray_index], kernel_data.film.use_light_pass);
+		path_state_init(kg,
+		                &kernel_split_state.sd_DL_shadow[ray_index],
+		                &kernel_split_state.path_state[ray_index],
+		                rng_hash,
+		                sample,
+		                &kernel_split_state.ray[ray_index]);
+#ifdef __SUBSURFACE__
+		kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
+#endif
+	}
+	else {
+		ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);  /* ray.t == 0: skip path setup */
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h
new file mode 100644
index 00000000000..66ce2dfb6f1
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel enqueues rays of different ray state into their
+ * appropriate queues:
+ *
+ * 1. Rays that have been determined to hit the background from the
+ *    "kernel_scene_intersect" kernel are enqueued in
+ *    QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+ * 2. Rays that have been determined to be actively participating in
+ *    path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * State of queues when this kernel is called:
+ * At entry,
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE
+ *     and RAY_UPDATE_BUFFER rays.
+ * At exit,
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
+ */
+ccl_device void kernel_queue_enqueue(KernelGlobals *kg,
+                                     ccl_local_param QueueEnqueueLocals *locals)
+{
+	/* Only 2 destination queues (hit-background/update/regenerate vs. active), so 2 local counters. */
+	int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+
+	if(lidx == 0) {
+		locals->queue_atomics[0] = 0;  /* NOTE(review): 0/1 presumably match the two queue constants - confirm */
+		locals->queue_atomics[1] = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int queue_number = -1;  /* -1: this ray is not written to any queue */
+
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) ||
+	   IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER) ||
+	   IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) {
+		queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+	}
+	else if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
+	        IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
+		queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	}
+
+	unsigned int my_lqidx;  /* only assigned and read when queue_number != -1 */
+	if(queue_number != -1) {
+		my_lqidx = get_local_queue_index(queue_number, locals->queue_atomics);
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	if(lidx == 0) {  /* a single local work-item fetches the per-queue global offsets */
+		locals->queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] =
+		        get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                                    locals->queue_atomics,
+		                                    kernel_split_params.queue_index);
+		locals->queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] =
+		        get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+		                                    locals->queue_atomics,
+		                                    kernel_split_params.queue_index);
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	unsigned int my_gqidx;
+	if(queue_number != -1) {
+		my_gqidx = get_global_queue_index(queue_number,
+		                                  kernel_split_params.queue_size,
+		                                  my_lqidx,
+		                                  locals->queue_atomics);
+		kernel_split_state.queue_data[my_gqidx] = ray_index;
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
index fc4b4ee38e5..f5378bc172b 100644
--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -14,119 +14,66 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_scene_intersect kernel.
- * This is the second kernel in the ray tracing logic. This is the first
- * of the path iteration kernels. This kernel takes care of scene_intersect function.
+/* This kernel takes care of scene_intersect function.
  *
  * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE.
  * This kernel processes rays of ray state RAY_ACTIVE
- * This kernel determines the rays that have hit the background and changes their ray state to RAY_HIT_BACKGROUND.
- *
- * The input and output are as follows,
- *
- * Ray_coop ---------------------------------------|--------- kernel_scene_intersect----------|--- PathState
- * PathState_coop ---------------------------------|                                          |--- Intersection
- * ray_state --------------------------------------|                                          |--- ray_state
- * use_queues_flag --------------------------------|                                          |
- * QueueData(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                                          |
- * kg (globals) -----------------------------------|                                          |
- * rng_coop ---------------------------------------|                                          |
- * sw ---------------------------------------------|                                          |
- * sh ---------------------------------------------|                                          |
- * queuesize --------------------------------------|                                          |
- *
- * Note on Queues :
- * Ideally we would want kernel_scene_intersect to work on queues.
- * But during the very first time, the queues will be empty and hence we perform a direct mapping
- * between ray-index and thread-index; From the next time onward, the queue will be filled and
- * we may start operating on queues.
- *
- * State of queue during the first time this kernel is called :
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.before and after this kernel
- *
- * State of queues during other times this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will have a mix of RAY_ACTIVE, RAY_UPDATE_BUFFER and RAY_REGENERATED rays;
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays ;
- * (The rays that are in the state RAY_UPDATE_BUFFER in both the queues are actually the same rays; These
- * are the rays that were in RAY_ACTIVE state during the initial enqueue but on further processing
- * , by different kernels, have turned into RAY_UPDATE_BUFFER rays. Since all kernel, even after fetching from
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS, proceed further based on ray state information, RAY_UPDATE_BUFFER rays
- * being present in QUEUE_ACTIVE_AND_REGENERATED_RAYS does not cause any logical issues)
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS - All RAY_REGENERATED rays will have been converted to RAY_ACTIVE and
- * Some rays in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue will move to state RAY_HIT_BACKGROUND
- * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS - no change
+ * This kernel determines the rays that have hit the background and changes
+ * their ray state to RAY_HIT_BACKGROUND.
  */
-
-ccl_device void kernel_scene_intersect(
-        KernelGlobals *kg,
-        ccl_global uint *rng_coop,
-        ccl_global Ray *Ray_coop,              /* Required for scene_intersect */
-        ccl_global PathState *PathState_coop,  /* Required for scene_intersect */
-        Intersection *Intersection_coop,       /* Required for scene_intersect */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int sw, int sh,
-        ccl_global char *use_queues_flag,      /* used to decide if this kernel should use
-                                                * queues to fetch ray index */
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int ray_index)
+ccl_device void kernel_scene_intersect(KernelGlobals *kg)
 {
-	/* All regenerated rays become active here */
-	if(IS_STATE(ray_state, ray_index, RAY_REGENERATED))
-		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE);
-
-	if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE))
-		return;
+	/* Fetch use_queues_flag; when set, the ray index is read from the queue below. */
+	char local_use_queues_flag = *kernel_split_params.use_queues_flag;
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
-#ifdef __KERNEL_DEBUG__
-	DebugData *debug_data = &debugdata_coop[ray_index];
-#endif
-	Intersection *isect = &Intersection_coop[ray_index];
-	PathState state = PathState_coop[ray_index];
-	Ray ray = Ray_coop[ray_index];
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(local_use_queues_flag) {
+		ray_index = get_ray_index(kg, ray_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          kernel_split_state.queue_data,
+		                          kernel_split_params.queue_size,
+		                          0);
 
-	/* intersect scene */
-	uint visibility = path_state_ray_visibility(kg, &state);
-
-#ifdef __HAIR__
-	float difl = 0.0f, extmax = 0.0f;
-	uint lcg_state = 0;
-	RNG rng = rng_coop[ray_index];
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	}
 
-	if(kernel_data.bvh.have_curves) {
-		if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {
-			float3 pixdiff = ray.dD.dx + ray.dD.dy;
-			/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
-			difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
+	/* Regenerated rays become active here (branched: ended if waiting on shared samples). */
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
+#ifdef __BRANCHED_PATH__
+		if(kernel_split_state.branched_state[ray_index].waiting_on_shared_samples) {
+			kernel_split_path_end(kg, ray_index);
+		}
+		else
+#endif  /* __BRANCHED_PATH__ */
+		{
+			ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
 		}
+	}
 
-		extmax = kernel_data.curve.maximum_width;
-		lcg_state = lcg_state_init(&rng, &state, 0x51633e2d);
+	if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+		return;
 	}
 
-	bool hit = scene_intersect(kg, ray, visibility, isect, &lcg_state, difl, extmax);
-#else
-	bool hit = scene_intersect(kg, ray, visibility, isect, NULL, 0.0f, 0.0f);
-#endif
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	Ray ray = kernel_split_state.ray[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 
-#ifdef __KERNEL_DEBUG__
-	if(state.flag & PATH_RAY_CAMERA) {
-		debug_data->num_bvh_traversal_steps += isect->num_traversal_steps;
-		debug_data->num_bvh_traversed_instances += isect->num_traversed_instances;
-	}
-	debug_data->num_ray_bounces++;
-#endif
+	Intersection isect;
+	bool hit = kernel_path_scene_intersect(kg, state, &ray, &isect, L);
+	kernel_split_state.isect[ray_index] = isect;  /* copy the local result into split state */
 
 	if(!hit) {
 		/* Change the state of rays that hit the background;
 		 * These rays undergo special processing in the
 		 * background_bufferUpdate kernel.
 		 */
-		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
+		ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND);
 	}
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
index cef64bf5f36..7032461b04a 100644
--- a/intern/cycles/kernel/split/kernel_shader_eval.h
+++ b/intern/cycles/kernel/split/kernel_shader_eval.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011-2015 Blender Foundation
+ * Copyright 2011-2017 Blender Foundation
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,57 +14,53 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_shader_eval kernel
- * This kernel is the 5th kernel in the ray tracing logic. This is
- * the 4rd kernel in path iteration. This kernel sets up the ShaderData
- * structure from the values computed by the previous kernels. It also identifies
- * the rays of state RAY_TO_REGENERATE and enqueues them in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
- *
- * The input and output of the kernel is as follows,
- * rng_coop -------------------------------------------|--- kernel_shader_eval --|--- sd
- * Ray_coop -------------------------------------------|                         |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * PathState_coop -------------------------------------|                         |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Intersection_coop ----------------------------------|                         |
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS)-------|                         |
- * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)---|                         |
- * ray_state ------------------------------------------|                         |
- * kg (globals) ---------------------------------------|                         |
- * queuesize ------------------------------------------|                         |
- *
- * Note on Queues :
- * This kernel reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
- * only the rays of state RAY_ACTIVE;
- * State of queues when this kernel is called,
- * at entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
- * at exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays
+/* This kernel evaluates the surface shader of every RAY_ACTIVE ray, using the
+ * ShaderData set up by the previous kernels.
  */
-ccl_device void kernel_shader_eval(
-        KernelGlobals *kg,
-        ShaderData *sd,                        /* Output ShaderData structure to be filled */
-        ccl_global uint *rng_coop,             /* Required for rbsdf calculation */
-        ccl_global Ray *Ray_coop,              /* Required for setting up shader from ray */
-        ccl_global PathState *PathState_coop,  /* Required for all functions in this kernel */
-        Intersection *Intersection_coop,       /* Required for setting up shader from ray */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int ray_index)
+ccl_device void kernel_shader_eval(KernelGlobals *kg)
 {
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	/* Sorting on cuda split is not implemented */
+#ifdef __KERNEL_CUDA__
+	int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
+#else
+	int queue_index = kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS];
+#endif
+	if(ray_index >= queue_index) {
+		return;
+	}
+	ray_index = get_ray_index(kg, ray_index,
+#ifdef __KERNEL_CUDA__
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+#else
+	                          QUEUE_SHADER_SORTED_RAYS,
+#endif
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+
+	ccl_global char *ray_state = kernel_split_state.ray_state;
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		Intersection *isect = &Intersection_coop[ray_index];
-		ccl_global uint *rng = &rng_coop[ray_index];
-		ccl_global PathState *state = &PathState_coop[ray_index];
-		Ray ray = Ray_coop[ray_index];
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 
-		shader_setup_from_ray(kg,
-		                      sd,
-		                      isect,
-		                      &ray);
-		float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF);
-		shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
+		shader_eval_surface(kg, &kernel_split_state.sd[ray_index], state, state->flag);
+#ifdef __BRANCHED_PATH__
+		if(kernel_data.integrator.branched) {
+			shader_merge_closures(&kernel_split_state.sd[ray_index]);
+		}
+		else
+#endif
+		{
+			shader_prepare_closures(&kernel_split_state.sd[ray_index], state);
+		}
 	}
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_setup.h b/intern/cycles/kernel/split/kernel_shader_setup.h
new file mode 100644
index 00000000000..0432689d9fa
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shader_setup.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel sets up the ShaderData structure of every RAY_ACTIVE ray from
+ * the ray and intersection stored in the split state by the previous kernels.
+ *
+ * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them
+ * in the QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+ */
+ccl_device void kernel_shader_setup(KernelGlobals *kg,
+                                    ccl_local_param unsigned int *local_queue_atomics)
+{
+	/* Reset the local counter used to enqueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS. */
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
+	if(ray_index >= queue_index) {
+		return;  /* NOTE(review): returns before enqueue_ray_index_local(); confirm that helper needs no work-group-wide participation. */
+	}
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+
+	char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+
+	/* Set up ShaderData for rays that are still active; copies of isect and ray are passed by pointer. */
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+		Intersection isect = kernel_split_state.isect[ray_index];
+		Ray ray = kernel_split_state.ray[ray_index];
+
+		shader_setup_from_ray(kg,
+		                      &kernel_split_state.sd[ray_index],
+		                      &isect,
+		                      &ray);
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h
new file mode 100644
index 00000000000..5a55b680695
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shader_sort.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Reorder each SHADER_SORT_BLOCK_SIZE chunk of the active ray queue by shader into QUEUE_SHADER_SORTED_RAYS. */
+ccl_device void kernel_shader_sort(KernelGlobals *kg,
+                                   ccl_local_param ShaderSortLocals *locals)
+{
+#ifndef __KERNEL_CUDA__
+	int tid = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	uint qsize = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
+	if(tid == 0) {
+		kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS] = qsize;
+	}
+
+	uint offset = (tid/SHADER_SORT_LOCAL_SIZE)*SHADER_SORT_BLOCK_SIZE;
+	if(offset >= qsize) {
+		return;
+	}
+
+	int lid = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
+	uint input = QUEUE_ACTIVE_AND_REGENERATED_RAYS * (kernel_split_params.queue_size);
+	uint output = QUEUE_SHADER_SORTED_RAYS * (kernel_split_params.queue_size);
+	ccl_local uint *local_value = &locals->local_value[0];
+	ccl_local ushort *local_index = &locals->local_index[0];
+
+	/* Load the sort keys (shader & SHADER_MASK) into local memory; empty or non-active slots keep the key ~0. */
+	for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+		uint idx = offset + i + lid;
+		uint add = input + idx;
+		uint value = (~0);
+		if(idx < qsize) {
+			int ray_index = kernel_split_state.queue_data[add];
+			bool valid = (ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
+			if(valid) {
+				value = kernel_split_state.sd[ray_index].shader & SHADER_MASK;
+			}
+		}
+		local_value[i + lid] = value;
+		local_index[i + lid] = i + lid;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	/* Skip sorting for the CPU split kernel; local_index then stays the identity mapping. */
+#  ifdef __KERNEL_OPENCL__
+
+	/* Bitonic sort: only local_index is permuted, local_value stays in place and is read through it. */
+	for(uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) {
+		for(uint inc = length; inc > 0; inc >>= 1) {
+			for(uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) {
+				uint i = lid + ii;
+				bool direction = ((i & (length << 1)) != 0);
+				uint j = i ^ inc;
+				ushort ioff = local_index[i];
+				ushort joff = local_index[j];
+				uint iKey = local_value[ioff];
+				uint jKey = local_value[joff];
+				bool smaller = (jKey < iKey) || (jKey == iKey && j < i);
+				bool swap = smaller ^ (j < i) ^ direction;
+				ccl_barrier(CCL_LOCAL_MEM_FENCE);
+				local_index[i] = (swap) ? joff : ioff;
+				local_index[j] = (swap) ? ioff : joff;
+				ccl_barrier(CCL_LOCAL_MEM_FENCE);
+			}
+		}
+	}
+#  endif /* __KERNEL_OPENCL__ */
+
+	/* Write the permuted ray indices to the sorted queue; slots whose key is ~0 become QUEUE_EMPTY_SLOT. */
+	for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+		uint idx = offset + i + lid;
+		uint lidx = local_index[i + lid];
+		uint outi = output + idx;
+		uint ini = input + offset + lidx;
+		uint value = local_value[lidx];
+		if(idx < qsize) {
+			kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT : kernel_split_state.queue_data[ini];
+		}
+	}
+#endif /* !__KERNEL_CUDA__ */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked.h b/intern/cycles/kernel/split/kernel_shadow_blocked.h
deleted file mode 100644
index 6153af47f96..00000000000
--- a/intern/cycles/kernel/split/kernel_shadow_blocked.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_split_common.h"
-
-/* Note on kernel_shadow_blocked kernel.
- * This is the ninth kernel in the ray tracing logic. This is the eighth
- * of the path iteration kernels. This kernel takes care of "shadow ray cast"
- * logic of the direct lighting and AO  part of ray tracing.
- *
- * The input and output are as follows,
- *
- * PathState_coop ----------------------------------|--- kernel_shadow_blocked --|
- * LightRay_dl_coop --------------------------------|                            |--- LightRay_dl_coop
- * LightRay_ao_coop --------------------------------|                            |--- LightRay_ao_coop
- * ray_state ---------------------------------------|                            |--- ray_state
- * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS &       |                            |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_AO_RAYS)
-              QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                            |
- * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS&
-              QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                            |
- * kg (globals) ------------------------------------|                            |
- * queuesize ---------------------------------------|                            |
- *
- * Note on sd_shadow : sd_shadow is neither input nor output to this kernel. sd_shadow is filled and consumed in this kernel itself.
- * Note on queues :
- * The kernel fetches from QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS queues. We will empty
- * these queues this kernel.
- * State of queues when this kernel is called :
- * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
- * before and after this kernel call.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS will be filled with rays marked with flags RAY_SHADOW_RAY_CAST_AO
- * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit.
- */
-ccl_device void kernel_shadow_blocked(
-        KernelGlobals *kg,
-        ccl_global PathState *PathState_coop,  /* Required for shadow blocked */
-        ccl_global Ray *LightRay_dl_coop,      /* Required for direct lighting's shadow blocked */
-        ccl_global Ray *LightRay_ao_coop,      /* Required for AO's shadow blocked */
-        ccl_global char *ray_state,
-        char shadow_blocked_type,
-        int ray_index)
-{
-	/* Flag determining if we need to update L. */
-	char update_path_radiance = 0;
-
-	if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
-	   IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
-	{
-		ccl_global PathState *state = &PathState_coop[ray_index];
-		ccl_global Ray *light_ray_dl_global = &LightRay_dl_coop[ray_index];
-		ccl_global Ray *light_ray_ao_global = &LightRay_ao_coop[ray_index];
-
-		ccl_global Ray *light_ray_global =
-		        shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO
-		                ? light_ray_ao_global
-		                : light_ray_dl_global;
-
-		float3 shadow;
-		update_path_radiance = !(shadow_blocked(kg,
-		                                        kg->sd_input,
-		                                        state,
-		                                        light_ray_global,
-		                                        &shadow));
-
-		/* We use light_ray_global's P and t to store shadow and
-		 * update_path_radiance.
-		 */
-		light_ray_global->P = shadow;
-		light_ray_global->t = update_path_radiance;
-	}
-}
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
new file mode 100644
index 00000000000..79aa2c9435b
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Shadow ray cast for AO: runs the AO pass for every ray queued in QUEUE_SHADOW_RAY_CAST_AO_RAYS. */
+ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
+{
+	unsigned int ao_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = QUEUE_EMPTY_SLOT;
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(thread_index < ao_queue_length) {
+		ray_index = get_ray_index(kg, thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS,
+		                          kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
+	}
+
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+
+	ShaderData *sd = &kernel_split_state.sd[ray_index];
+	ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	float3 throughput = kernel_split_state.throughput[ray_index];
+
+#ifdef __BRANCHED_PATH__
+	if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+#endif
+		kernel_path_ao(kg, sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, sd));
+#ifdef __BRANCHED_PATH__
+	}
+	else {  /* Branched integrator and the ray is not flagged RAY_BRANCHED_INDIRECT. */
+		kernel_branched_path_ao(kg, sd, emission_sd, L, state, throughput);
+	}
+#endif
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
new file mode 100644
index 00000000000..b52f9a5eb81
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Shadow ray cast for direct lighting: processes every ray queued in QUEUE_SHADOW_RAY_CAST_DL_RAYS. */
+ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
+{
+	unsigned int dl_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = QUEUE_EMPTY_SLOT;
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(thread_index < dl_queue_length) {
+		ray_index = get_ray_index(kg, thread_index, QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+		                          kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
+	}
+
+#ifdef __BRANCHED_PATH__
+	/* TODO(mai): move this somewhere else? */
+	if(thread_index == 0) {
+		/* Clear QUEUE_INACTIVE_RAYS before next kernel. */
+		kernel_split_params.queue_index[QUEUE_INACTIVE_RAYS] = 0;
+	}
+#endif  /* __BRANCHED_PATH__ */
+
+	if(ray_index == QUEUE_EMPTY_SLOT)
+		return;
+
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	Ray ray = kernel_split_state.light_ray[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ShaderData *sd = &kernel_split_state.sd[ray_index];
+	float3 throughput = kernel_split_state.throughput[ray_index];
+
+	BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
+	ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+	bool is_lamp = kernel_split_state.is_lamp[ray_index];
+
+#  if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)
+	bool use_branched = false;
+	int all = 0;  /* Forwarded as the last argument of kernel_branched_path_surface_connect_light(). */
+
+	if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+		use_branched = true;
+		all = 1;
+	}
+#    if defined(__BRANCHED_PATH__)
+	else if(kernel_data.integrator.branched) {
+		use_branched = true;
+
+		if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
+			all = (kernel_data.integrator.sample_all_lights_indirect);
+		}
+		else
+		{
+			all = (kernel_data.integrator.sample_all_lights_direct);
+		}
+	}
+#    endif  /* __BRANCHED_PATH__ */
+
+	if(use_branched) {
+		kernel_branched_path_surface_connect_light(kg,
+		                                           sd,
+		                                           emission_sd,
+		                                           state,
+		                                           throughput,
+		                                           1.0f,
+		                                           L,
+		                                           all);
+	}
+	else
+#  endif  /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__) */
+	{
+		/* Trace the shadow ray stored in light_ray by an earlier kernel. */
+		float3 shadow;
+
+		if(!shadow_blocked(kg,
+		                   sd,
+		                   emission_sd,
+		                   state,
+		                   &ray,
+		                   &shadow))
+		{
+			/* accumulate */
+			path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
+		}
+		else {
+			path_radiance_accum_total_light(L, state, throughput, &L_light);
+		}
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
index 2135ee22b2e..21886ee62ee 100644
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -17,48 +17,78 @@
 #ifndef  __KERNEL_SPLIT_H__
 #define  __KERNEL_SPLIT_H__
 
-#include "kernel_compat_opencl.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_image_opencl.h"
-
-#include "util_atomic.h"
-
-#include "kernel_random.h"
-#include "kernel_projection.h"
-#include "kernel_montecarlo.h"
-#include "kernel_differential.h"
-#include "kernel_camera.h"
-
-#include "geom/geom.h"
-#include "bvh/bvh.h"
-
-#include "kernel_accumulate.h"
-#include "kernel_shader.h"
-#include "kernel_light.h"
-#include "kernel_passes.h"
-
-#ifdef __SUBSURFACE__
-#include "kernel_subsurface.h"
+#include "kernel/kernel_math.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/split/kernel_split_data.h"
+
+#include "kernel/kernel_globals.h"
+
+#ifdef __OSL__
+#  include "kernel/osl/osl_shader.h"
+#endif
+
+#ifdef __KERNEL_OPENCL__
+#  include "kernel/kernels/opencl/kernel_opencl_image.h"
+#endif
+#ifdef __KERNEL_CUDA__
+#  include "kernel/kernels/cuda/kernel_cuda_image.h"
+#endif
+#ifdef __KERNEL_CPU__
+#  include "kernel/kernels/cpu/kernel_cpu_image.h"
+#endif
+
+#include "util/util_atomic.h"
+
+#include "kernel/kernel_path.h"
+#ifdef __BRANCHED_PATH__
+#  include "kernel/kernel_path_branched.h"
 #endif
 
-#ifdef __VOLUME__
-#include "kernel_volume.h"
+#include "kernel/kernel_queues.h"
+#include "kernel/kernel_work_stealing.h"
+
+#ifdef __BRANCHED_PATH__
+#  include "kernel/split/kernel_branched.h"
 #endif
 
-#include "kernel_path_state.h"
-#include "kernel_shadow.h"
-#include "kernel_emission.h"
-#include "kernel_path_common.h"
-#include "kernel_path_surface.h"
-#include "kernel_path_volume.h"
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index)
+{
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+
+#ifdef __BRANCHED_PATH__
+	if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT_SHARED)) {
+		int orig_ray = kernel_split_state.branched_state[ray_index].original_ray;
+
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+		PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray];
+
+		path_radiance_sum_indirect(L);
+		path_radiance_accum_sample(orig_ray_L, L);
+
+		atomic_fetch_and_dec_uint32((ccl_global uint*)&kernel_split_state.branched_state[orig_ray].shared_sample_count);
 
-#ifdef __KERNEL_DEBUG__
-#include "kernel_debug.h"
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+	}
+	else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) {
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER);
+	}
+	else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) {
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER);
+	}
+	else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) {
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER);
+	}
+	else {
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+	}
+#else
+	ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
 #endif
+}
 
-#include "kernel_queues.h"
-#include "kernel_work_stealing.h"
+CCL_NAMESPACE_END
 
 #endif  /* __KERNEL_SPLIT_H__ */
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
new file mode 100644
index 00000000000..eac22050a38
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_split_data.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_SPLIT_DATA_H__
+#define __KERNEL_SPLIT_DATA_H__
+
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
+
+CCL_NAMESPACE_BEGIN
+/* Sum of the sizes of all SPLIT_DATA_ENTRIES for num_elements elements, each passed through align_up(..., 16). */
+ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements)
+{
+	(void)kg;  /* Unused on CPU. */
+
+	uint64_t size = 0;
+#define SPLIT_DATA_ENTRY(type, name, num) + align_up(num_elements * num * sizeof(type), 16)
+	size = size SPLIT_DATA_ENTRIES;
+#undef SPLIT_DATA_ENTRY
+
+	return size;
+}
+/* Point every SplitData entry at its slice of `data`, in SPLIT_DATA_ENTRIES order; ray_state is assigned separately. */
+ccl_device_inline void split_data_init(KernelGlobals *kg,
+                                       ccl_global SplitData *split_data,
+                                       size_t num_elements,
+                                       ccl_global void *data,
+                                       ccl_global char *ray_state)
+{
+	(void)kg;  /* Unused on CPU. */
+
+	ccl_global char *p = (ccl_global char*)data;
+
+#define SPLIT_DATA_ENTRY(type, name, num) \
+	split_data->name = (type*)p; p += align_up(num_elements * num * sizeof(type), 16);
+	SPLIT_DATA_ENTRIES;
+#undef SPLIT_DATA_ENTRY
+
+	split_data->ray_state = ray_state;
+}
+
+CCL_NAMESPACE_END
+
+#endif  /* __KERNEL_SPLIT_DATA_H__ */
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
new file mode 100644
index 00000000000..b0e6e5f5250
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@@ -0,0 +1,175 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_SPLIT_DATA_TYPES_H__
+#define __KERNEL_SPLIT_DATA_TYPES_H__
+
+CCL_NAMESPACE_BEGIN
+
+/* parameters used by the split kernels, we use a single struct to avoid passing these to each kernel */
+
+typedef struct SplitParams {
+	WorkTile tile;
+	uint total_work_size;
+
+	ccl_global unsigned int *work_pools;
+
+	ccl_global int *queue_index;
+	int queue_size;
+	ccl_global char *use_queues_flag;
+
+	/* Place for storing sd->flag. AMD GPU OpenCL compiler workaround */
+	int dummy_sd_flag;
+} SplitParams;
+
+/* Global memory variables [porting]; This memory is used for
+ * co-operation between different kernels; Data written by one
+ * kernel will be available to another kernel via this global
+ * memory.
+ */
+
+/* SPLIT_DATA_ENTRY(type, name, num) */
+
+#ifdef __BRANCHED_PATH__
+
+typedef ccl_global struct SplitBranchedState {
+	/* various state that must be kept and restored after an indirect loop */
+	PathState path_state;
+	float3 throughput;
+	Ray ray;
+
+	struct ShaderData sd;
+	Intersection isect;
+
+	char ray_state;
+
+	/* indirect loop state */
+	int next_closure;
+	int next_sample;
+
+#ifdef __SUBSURFACE__
+	int ss_next_closure;
+	int ss_next_sample;
+	int next_hit;
+	int num_hits;
+
+	uint lcg_state;
+	SubsurfaceIntersection ss_isect;
+
+#  ifdef __VOLUME__
+	VolumeStack volume_stack[VOLUME_STACK_SIZE];
+#  endif  /* __VOLUME__ */
+#endif  /*__SUBSURFACE__ */
+
+	int shared_sample_count; /* number of branched samples shared with other threads */
+	int original_ray; /* index of original ray when sharing branched samples */
+	bool waiting_on_shared_samples;
+} SplitBranchedState;
+
+#define SPLIT_DATA_BRANCHED_ENTRIES \
+	SPLIT_DATA_ENTRY( SplitBranchedState, branched_state, 1)
+#else
+#define SPLIT_DATA_BRANCHED_ENTRIES
+#endif  /* __BRANCHED_PATH__ */
+
+#ifdef __SUBSURFACE__
+#  define SPLIT_DATA_SUBSURFACE_ENTRIES \
+	SPLIT_DATA_ENTRY(ccl_global SubsurfaceIndirectRays, ss_rays, 1)
+#else
+#  define SPLIT_DATA_SUBSURFACE_ENTRIES
+#endif /* __SUBSURFACE__ */
+
+#ifdef __VOLUME__
+#  define SPLIT_DATA_VOLUME_ENTRIES \
+	SPLIT_DATA_ENTRY(ccl_global PathState, state_shadow, 1)
+#else
+#  define SPLIT_DATA_VOLUME_ENTRIES
+#endif /* __VOLUME__ */
+
+#define SPLIT_DATA_ENTRIES \
+	SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
+	SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
+	SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
+	SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
+	SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
+	SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \
+	SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \
+	SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
+	SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
+	SPLIT_DATA_SUBSURFACE_ENTRIES \
+	SPLIT_DATA_VOLUME_ENTRIES \
+	SPLIT_DATA_BRANCHED_ENTRIES \
+
+/* entries to be copied to inactive rays when sharing branched samples (TODO: which are actually needed?) */
+#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \
+	SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
+	SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
+	SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
+	SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
+	SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
+	SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
+	SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
+	SPLIT_DATA_SUBSURFACE_ENTRIES \
+	SPLIT_DATA_VOLUME_ENTRIES \
+	SPLIT_DATA_BRANCHED_ENTRIES \
+
+/* struct that holds pointers to data in the shared state buffer */
+typedef struct SplitData {
+#define SPLIT_DATA_ENTRY(type, name, num) type *name;
+	SPLIT_DATA_ENTRIES
+#undef SPLIT_DATA_ENTRY
+
+	/* this is actually in a separate buffer from the rest of the split state data (so it can be read back from
+	 * the host easily) but is still used the same as the other data so we have it here in this struct as well
+	 */
+	ccl_global char *ray_state;
+} SplitData;
+
+#ifndef __KERNEL_CUDA__
+#  define kernel_split_state (kg->split_data)
+#  define kernel_split_params (kg->split_param_data)
+#else
+__device__ SplitData __split_data;
+#  define kernel_split_state (__split_data)
+__device__ SplitParams __split_param_data;
+#  define kernel_split_params (__split_param_data)
+#endif  /* __KERNEL_CUDA__ */
+
+/* Local storage for queue_enqueue kernel. */
+typedef struct QueueEnqueueLocals {
+	uint queue_atomics[2];
+} QueueEnqueueLocals;
+
+/* Local storage for holdout_emission_blurring_pathtermination_ao kernel. */
+typedef struct BackgroundAOLocals {
+	uint queue_atomics_bg;
+	uint queue_atomics_ao;
+} BackgroundAOLocals;
+
+typedef struct ShaderSortLocals {
+	uint local_value[SHADER_SORT_BLOCK_SIZE];
+	ushort local_index[SHADER_SORT_BLOCK_SIZE];
+} ShaderSortLocals;
+
+CCL_NAMESPACE_END
+
+#endif  /* __KERNEL_SPLIT_DATA_TYPES_H__ */
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
new file mode 100644
index 00000000000..3b957856aea
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
@@ -0,0 +1,313 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__)
+
+ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg, int ray_index)
+{
+	/* Run the shared indirect-loop setup first, then rewind the subsurface-specific cursors of this ray. */
+	kernel_split_branched_path_indirect_loop_init(kg, ray_index);
+
+	SplitBranchedState *state = &kernel_split_state.branched_state[ray_index];
+	state->ss_next_closure = 0;
+	state->ss_next_sample = 0;
+	state->num_hits = 0;
+	state->next_hit = 0;
+
+	/* Finally tag the ray with the branched subsurface indirect flag. */
+	ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT);
+}
+
+/* Resumable closure -> sample -> hit loop over the BSSRDF closures of the branched state's shader data.
+ * Returns true after saving its position (or while shared_sample_count > 0), false after ending the indirect loop. */
+ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(KernelGlobals *kg, int ray_index)
+{
+	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
+	ShaderData *sd = &branched_state->sd;
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+	for(int i = branched_state->ss_next_closure; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
+
+		if(!CLOSURE_IS_BSSRDF(sc->type))
+			continue;
+
+		/* set up random number generator, but only while these four cursors are all still zero */
+		if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 &&
+		   branched_state->next_closure == 0 && branched_state->next_sample == 0)
+		{
+			branched_state->lcg_state = lcg_state_init_addrspace(&branched_state->path_state,
+			                                                     0x68bc21eb);
+		}
+		int num_samples = kernel_data.integrator.subsurface_samples;
+		float num_samples_inv = 1.0f/num_samples;
+		uint bssrdf_rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
+
+		/* do subsurface scatter step with copy of shader data, this will replace the BSSRDF with a diffuse BSDF closure */
+		for(int j = branched_state->ss_next_sample; j < num_samples; j++) {
+			ccl_global SubsurfaceIntersection *ss_isect = &branched_state->ss_isect;
+			float bssrdf_u, bssrdf_v;
+			path_branched_rng_2D(kg,
+			                     bssrdf_rng_hash,
+			                     &branched_state->path_state,
+			                     j,
+			                     num_samples,
+			                     PRNG_BSDF_U,
+			                     &bssrdf_u,
+			                     &bssrdf_v);
+
+			/* intersection is expensive so avoid doing multiple times for the same input (only when these cursors are zero) */
+			if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+				uint lcg_state = branched_state->lcg_state;
+				SubsurfaceIntersection ss_isect_private;  /* NOTE(review): local copies, presumably for address-space reasons -- confirm */
+
+				branched_state->num_hits = subsurface_scatter_multi_intersect(kg,
+				                                                              &ss_isect_private,
+				                                                              sd,
+				                                                              sc,
+				                                                              &lcg_state,
+				                                                              bssrdf_u, bssrdf_v,
+				                                                              true);
+
+				branched_state->lcg_state = lcg_state;
+				*ss_isect = ss_isect_private;
+			}
+
+#ifdef __VOLUME__
+			Ray volume_ray = branched_state->ray;
+			bool need_update_volume_stack =
+			        kernel_data.integrator.use_volumes &&
+			        sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
+#endif  /* __VOLUME__ */
+
+			/* compute lighting with the BSDF closure */
+			for(int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) {
+				ShaderData *bssrdf_sd = &kernel_split_state.sd[ray_index];
+				*bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is
+				                   * important as the indirect path will write into bssrdf_sd */
+
+				SubsurfaceIntersection ss_isect_private = *ss_isect;
+				subsurface_scatter_multi_setup(kg,
+				                               &ss_isect_private,
+				                               hit,
+				                               bssrdf_sd,
+				                               &branched_state->path_state,
+				                               branched_state->path_state.flag,
+				                               sc,
+				                               true);
+				*ss_isect = ss_isect_private;
+
+				ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index];
+				*hit_state = branched_state->path_state;
+
+				path_state_branch(hit_state, j, num_samples);
+
+#ifdef __VOLUME__
+				if(need_update_volume_stack) {
+					/* Setup ray from previous surface point to the new one. */
+					float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng);
+					volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t);
+
+					/* this next part is expensive as it does scene intersection so only do once */
+					if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+						for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+							branched_state->volume_stack[k] = hit_state->volume_stack[k];
+						}
+
+						kernel_volume_stack_update_for_subsurface(kg,
+						                                          emission_sd,
+						                                          &volume_ray,
+						                                          branched_state->volume_stack);
+					}
+
+					for(int k = 0; k < VOLUME_STACK_SIZE; k++) {
+						hit_state->volume_stack[k] = branched_state->volume_stack[k];
+					}
+				}
+#endif  /* __VOLUME__ */
+
+#ifdef __EMISSION__
+				if(branched_state->next_closure == 0 && branched_state->next_sample == 0) {
+					/* direct light (skipped when next_closure or next_sample is non-zero) */
+					if(kernel_data.integrator.use_direct_light) {
+						int all = (kernel_data.integrator.sample_all_lights_direct) ||
+							      (branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER);
+						kernel_branched_path_surface_connect_light(kg,
+						                                           bssrdf_sd,
+						                                           emission_sd,
+						                                           hit_state,
+						                                           branched_state->throughput,
+						                                           num_samples_inv,
+						                                           L,
+						                                           all);
+					}
+				}
+#endif  /* __EMISSION__ */
+
+				/* indirect light; on a true result, save the closure/sample/hit position and return true to the caller */
+				if(kernel_split_branched_path_surface_indirect_light_iter(kg,
+				                                                          ray_index,
+				                                                          num_samples_inv,
+				                                                          bssrdf_sd,
+				                                                          false,
+				                                                          false))
+				{
+					branched_state->ss_next_closure = i;
+					branched_state->ss_next_sample = j;
+					branched_state->next_hit = hit;
+
+					return true;
+				}
+
+				branched_state->next_closure = 0;
+			}
+
+			branched_state->next_hit = 0;  /* the next sample starts from its first hit */
+		}
+
+		branched_state->ss_next_sample = 0;  /* the next closure starts from its first sample */
+	}
+
+	branched_state->ss_next_closure = sd->num_closure;  /* a later call skips the closure loop entirely */
+
+	branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+	if(branched_state->waiting_on_shared_samples) {
+		return true;
+	}
+
+	kernel_split_branched_path_indirect_loop_end(kg, ray_index);
+
+	return false;
+}
+
+#endif  /* __BRANCHED_PATH__ && __SUBSURFACE__ */
+
+/* Split-kernel stage for subsurface scattering. Thread 0 zeroes both queue indices, then each thread fetches its
+ * ray; with __SUBSURFACE__, active rays whose shader data has SD_BSSRDF are scattered (regular or branched path). */
+ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
+{
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(thread_index == 0) {
+		/* We will empty both queues in this kernel. */
+		kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+		kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+	}
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);  /* same expression as thread_index */
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          1);
+	get_ray_index(kg, thread_index,
+	              QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	              kernel_split_state.queue_data,
+	              kernel_split_params.queue_size,
+	              1);  /* result unused; NOTE(review): presumably called only to clear this queue's slot -- confirm */
+
+#ifdef __SUBSURFACE__
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+		ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+		ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+		ShaderData *sd = &kernel_split_state.sd[ray_index];
+		ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+		if(sd->flag & SD_BSSRDF) {
+
+#ifdef __BRANCHED_PATH__
+			if(!kernel_data.integrator.branched) {  /* regular (non-branched) integrator */
+#endif
+				if(kernel_path_subsurface_scatter(kg,
+				                                  sd,
+				                                  emission_sd,
+				                                  L,
+				                                  state,
+				                                  ray,
+				                                  throughput,
+				                                  ss_indirect))
+				{
+					kernel_split_path_end(kg, ray_index);
+				}
+#ifdef __BRANCHED_PATH__
+			}
+			else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {  /* branched, indirect ray: one scatter step */
+				float bssrdf_u, bssrdf_v;
+				path_state_rng_2D(kg,
+				                  state,
+				                  PRNG_BSDF_U,
+				                  &bssrdf_u, &bssrdf_v);
+
+				const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u);
+
+				/* do bssrdf scatter step if we picked a bssrdf closure */
+				if(sc) {
+					uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb);
+					subsurface_scatter_step(kg,
+					                        sd,
+					                        state,
+					                        state->flag,
+					                        sc,
+					                        &lcg_state,
+					                        bssrdf_u, bssrdf_v,
+					                        false);
+				}
+			}
+			else {  /* branched, any other ray: initialise and start the resumable subsurface loop */
+				kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index);
+
+				if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+				}
+			}
+#endif
+		}
+	}
+
+#  ifdef __BRANCHED_PATH__
+	if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+		kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0;
+	}
+
+	/* iter loop: continue the rays queued in QUEUE_SUBSURFACE_INDIRECT_ITER */
+	ray_index = get_ray_index(kg, ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
+	                          QUEUE_SUBSURFACE_INDIRECT_ITER,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          1);
+
+	if(IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) {
+		/* for render passes, sum and reset indirect light pass variables for the next samples */
+		path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
+		path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
+
+		if(kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+		}
+	}
+#  endif  /* __BRANCHED_PATH__ */
+
+#endif  /* __SUBSURFACE__ */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h
deleted file mode 100644
index a21e9b6a0b1..00000000000
--- a/intern/cycles/kernel/split/kernel_sum_all_radiance.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../kernel_compat_opencl.h"
-#include "../kernel_math.h"
-#include "../kernel_types.h"
-#include "../kernel_globals.h"
-
-/* Since we process various samples in parallel; The output radiance of different samples
- * are stored in different locations; This kernel combines the output radiance contributed
- * by all different samples and stores them in the RenderTile's output buffer.
- */
-ccl_device void kernel_sum_all_radiance(
-        ccl_constant KernelData *data,               /* To get pass_stride to offet into buffer */
-        ccl_global float *buffer,                    /* Output buffer of RenderTile */
-        ccl_global float *per_sample_output_buffer,  /* Radiance contributed by all samples */
-        int parallel_samples, int sw, int sh, int stride,
-        int buffer_offset_x,
-        int buffer_offset_y,
-        int buffer_stride,
-        int start_sample)
-{
-	int x = get_global_id(0);
-	int y = get_global_id(1);
-
-	if(x < sw && y < sh) {
-		buffer += ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * (data->film.pass_stride);
-		per_sample_output_buffer += ((x + y * stride) * parallel_samples) * (data->film.pass_stride);
-
-		int sample_stride = (data->film.pass_stride);
-
-		int sample_iterator = 0;
-		int pass_stride_iterator = 0;
-		int num_floats = data->film.pass_stride;
-
-		for(sample_iterator = 0; sample_iterator < parallel_samples; sample_iterator++) {
-			for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) {
-				*(buffer + pass_stride_iterator) =
-				        (start_sample == 0 && sample_iterator == 0)
-				                ? *(per_sample_output_buffer + pass_stride_iterator)
-				                : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator);
-			}
-			per_sample_output_buffer += sample_stride;
-		}
-	}
-}
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 88ec7fe6fcc..d748e76fa80 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -39,7 +39,7 @@
  * mostly taken care of in the SVM compiler.
  */
 
-#include "svm_types.h"
+#include "kernel/svm/svm_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -139,49 +139,49 @@ CCL_NAMESPACE_END
 
 /* Nodes */
 
-#include "svm_noise.h"
+#include "kernel/svm/svm_noise.h"
 #include "svm_texture.h"
 
-#include "svm_color_util.h"
-#include "svm_math_util.h"
-
-#include "svm_attribute.h"
-#include "svm_gradient.h"
-#include "svm_blackbody.h"
-#include "svm_closure.h"
-#include "svm_noisetex.h"
-#include "svm_convert.h"
-#include "svm_displace.h"
-#include "svm_fresnel.h"
-#include "svm_wireframe.h"
-#include "svm_wavelength.h"
-#include "svm_camera.h"
-#include "svm_geometry.h"
-#include "svm_hsv.h"
-#include "svm_image.h"
-#include "svm_gamma.h"
-#include "svm_brightness.h"
-#include "svm_invert.h"
-#include "svm_light_path.h"
-#include "svm_magic.h"
-#include "svm_mapping.h"
-#include "svm_normal.h"
-#include "svm_wave.h"
-#include "svm_math.h"
-#include "svm_mix.h"
-#include "svm_ramp.h"
-#include "svm_sepcomb_hsv.h"
-#include "svm_sepcomb_vector.h"
-#include "svm_musgrave.h"
-#include "svm_sky.h"
-#include "svm_tex_coord.h"
-#include "svm_value.h"
-#include "svm_voronoi.h"
-#include "svm_checker.h"
-#include "svm_brick.h"
-#include "svm_vector_transform.h"
-#include "svm_voxel.h"
-#include "svm_bump.h"
+#include "kernel/svm/svm_color_util.h"
+#include "kernel/svm/svm_math_util.h"
+
+#include "kernel/svm/svm_attribute.h"
+#include "kernel/svm/svm_gradient.h"
+#include "kernel/svm/svm_blackbody.h"
+#include "kernel/svm/svm_closure.h"
+#include "kernel/svm/svm_noisetex.h"
+#include "kernel/svm/svm_convert.h"
+#include "kernel/svm/svm_displace.h"
+#include "kernel/svm/svm_fresnel.h"
+#include "kernel/svm/svm_wireframe.h"
+#include "kernel/svm/svm_wavelength.h"
+#include "kernel/svm/svm_camera.h"
+#include "kernel/svm/svm_geometry.h"
+#include "kernel/svm/svm_hsv.h"
+#include "kernel/svm/svm_image.h"
+#include "kernel/svm/svm_gamma.h"
+#include "kernel/svm/svm_brightness.h"
+#include "kernel/svm/svm_invert.h"
+#include "kernel/svm/svm_light_path.h"
+#include "kernel/svm/svm_magic.h"
+#include "kernel/svm/svm_mapping.h"
+#include "kernel/svm/svm_normal.h"
+#include "kernel/svm/svm_wave.h"
+#include "kernel/svm/svm_math.h"
+#include "kernel/svm/svm_mix.h"
+#include "kernel/svm/svm_ramp.h"
+#include "kernel/svm/svm_sepcomb_hsv.h"
+#include "kernel/svm/svm_sepcomb_vector.h"
+#include "kernel/svm/svm_musgrave.h"
+#include "kernel/svm/svm_sky.h"
+#include "kernel/svm/svm_tex_coord.h"
+#include "kernel/svm/svm_value.h"
+#include "kernel/svm/svm_voronoi.h"
+#include "kernel/svm/svm_checker.h"
+#include "kernel/svm/svm_brick.h"
+#include "kernel/svm/svm_vector_transform.h"
+#include "kernel/svm/svm_voxel.h"
+#include "kernel/svm/svm_bump.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -192,7 +192,7 @@ CCL_NAMESPACE_BEGIN
 ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderType type, int path_flag)
 {
 	float stack[SVM_STACK_SIZE];
-	int offset = ccl_fetch(sd, shader) & SHADER_MASK;
+	int offset = sd->shader & SHADER_MASK;
 
 	while(1) {
 		uint4 node = read_node(kg, &offset);
diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h
index 0e55c99ae97..229a3f20421 100644
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@@ -27,7 +27,7 @@ ccl_device AttributeDescriptor svm_node_attr_init(KernelGlobals *kg, ShaderData
 
 	AttributeDescriptor desc;
 
-	if(ccl_fetch(sd, object) != OBJECT_NONE) {
+	if(sd->object != OBJECT_NONE) {
 		desc = find_attribute(kg, sd, node.y);
 		if(desc.offset == ATTR_STD_NOT_FOUND) {
 			desc = attribute_not_found();
diff --git a/intern/cycles/kernel/svm/svm_blackbody.h b/intern/cycles/kernel/svm/svm_blackbody.h
index b750ad87b7f..51590b18505 100644
--- a/intern/cycles/kernel/svm/svm_blackbody.h
+++ b/intern/cycles/kernel/svm/svm_blackbody.h
@@ -41,8 +41,7 @@ ccl_device void svm_node_blackbody(KernelGlobals *kg, ShaderData *sd, float *sta
 
 	float3 color_rgb = svm_math_blackbody_color(temperature);
 
-	if(stack_valid(col_offset))
-		stack_store_float3(stack, col_offset, color_rgb);
+	stack_store_float3(stack, col_offset, color_rgb);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_bump.h b/intern/cycles/kernel/svm/svm_bump.h
index 04a8c7b64e5..610d9af9e1f 100644
--- a/intern/cycles/kernel/svm/svm_bump.h
+++ b/intern/cycles/kernel/svm/svm_bump.h
@@ -21,9 +21,9 @@ CCL_NAMESPACE_BEGIN
 ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset)
 {
 	/* save state */
-	stack_store_float3(stack, offset+0, ccl_fetch(sd, P));
-	stack_store_float3(stack, offset+3, ccl_fetch(sd, dP).dx);
-	stack_store_float3(stack, offset+6, ccl_fetch(sd, dP).dy);
+	stack_store_float3(stack, offset+0, sd->P);
+	stack_store_float3(stack, offset+3, sd->dP.dx);
+	stack_store_float3(stack, offset+6, sd->dP.dy);
 
 	/* set state as if undisplaced */
 	const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_POSITION_UNDISPLACED);
@@ -36,18 +36,18 @@ ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, floa
 		object_dir_transform(kg, sd, &dPdx);
 		object_dir_transform(kg, sd, &dPdy);
 
-		ccl_fetch(sd, P) = P;
-		ccl_fetch(sd, dP).dx = dPdx;
-		ccl_fetch(sd, dP).dy = dPdy;
+		sd->P = P;
+		sd->dP.dx = dPdx;
+		sd->dP.dy = dPdy;
 	}
 }
 
 ccl_device void svm_node_leave_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset)
 {
 	/* restore state */
-	ccl_fetch(sd, P) = stack_load_float3(stack, offset+0);
-	ccl_fetch(sd, dP).dx = stack_load_float3(stack, offset+3);
-	ccl_fetch(sd, dP).dy = stack_load_float3(stack, offset+6);
+	sd->P = stack_load_float3(stack, offset+0);
+	sd->dP.dx = stack_load_float3(stack, offset+3);
+	sd->dP.dy = stack_load_float3(stack, offset+6);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h
index 00678a49d70..90249dfd978 100644
--- a/intern/cycles/kernel/svm/svm_camera.h
+++ b/intern/cycles/kernel/svm/svm_camera.h
@@ -23,7 +23,7 @@ ccl_device void svm_node_camera(KernelGlobals *kg, ShaderData *sd, float *stack,
 	float3 vector;
 
 	Transform tfm = kernel_data.cam.worldtocamera;
-	vector = transform_point(&tfm, ccl_fetch(sd, P));
+	vector = transform_point(&tfm, sd->P);
 	zdepth = vector.z;
 	distance = len(vector);
 
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 017d697f9f8..4268813b263 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -25,13 +25,13 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t
 			bsdf->alpha_y = 0.0f;
 			bsdf->alpha_x = 0.0f;
 			bsdf->ior = eta;
-			ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf);
+			sd->flag |= bsdf_refraction_setup(bsdf);
 		}
 		else {
 			bsdf->alpha_y = 0.0f;
 			bsdf->alpha_x = 0.0f;
 			bsdf->ior = 0.0f;
-			ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf);
+			sd->flag |= bsdf_reflection_setup(bsdf);
 		}
 	}
 	else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) {
@@ -40,9 +40,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t
 		bsdf->ior = eta;
 
 		if(refract)
-			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
+			sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
 		else
-			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf);
+			sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
 	}
 	else {
 		bsdf->alpha_x = roughness;
@@ -50,9 +50,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t
 		bsdf->ior = eta;
 
 		if(refract)
-			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+			sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
 		else
-			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf);
+			sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
 	}
 }
 
@@ -70,14 +70,353 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 	if(mix_weight == 0.0f)
 		return;
 
-	float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): ccl_fetch(sd, N);
+	float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): sd->N;
 
 	float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z);
 	float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w);
 
 	switch(type) {
+#ifdef __PRINCIPLED__
+		case CLOSURE_BSDF_PRINCIPLED_ID: {
+			uint specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset, sheen_offset,
+				sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset, eta_offset, transmission_offset,
+				anisotropic_rotation_offset, transmission_roughness_offset;
+			uint4 data_node2 = read_node(kg, offset);
+
+			float3 T = stack_load_float3(stack, data_node.y);
+			decode_node_uchar4(data_node.z, &specular_offset, &roughness_offset, &specular_tint_offset, &anisotropic_offset);
+			decode_node_uchar4(data_node.w, &sheen_offset, &sheen_tint_offset, &clearcoat_offset, &clearcoat_roughness_offset);
+			decode_node_uchar4(data_node2.x, &eta_offset, &transmission_offset, &anisotropic_rotation_offset, &transmission_roughness_offset);
+
+			// get Disney principled parameters
+			float metallic = param1;
+			float subsurface = param2;
+			float specular = stack_load_float(stack, specular_offset);
+			float roughness = stack_load_float(stack, roughness_offset);
+			float specular_tint = stack_load_float(stack, specular_tint_offset);
+			float anisotropic = stack_load_float(stack, anisotropic_offset);
+			float sheen = stack_load_float(stack, sheen_offset);
+			float sheen_tint = stack_load_float(stack, sheen_tint_offset);
+			float clearcoat = stack_load_float(stack, clearcoat_offset);
+			float clearcoat_roughness = stack_load_float(stack, clearcoat_roughness_offset);
+			float transmission = stack_load_float(stack, transmission_offset);
+			float anisotropic_rotation = stack_load_float(stack, anisotropic_rotation_offset);
+			float transmission_roughness = stack_load_float(stack, transmission_roughness_offset);
+			float eta = fmaxf(stack_load_float(stack, eta_offset), 1e-5f);
+
+			ClosureType distribution = stack_valid(data_node2.y) ? (ClosureType) data_node2.y : CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID;
+
+			/* rotate tangent */
+			if(anisotropic_rotation != 0.0f)
+				T = rotate_around_axis(T, N, anisotropic_rotation * M_2PI_F);
+
+			/* calculate ior */
+			float ior = (sd->flag & SD_BACKFACING) ? 1.0f / eta : eta;
+
+			// calculate fresnel for refraction
+			float cosNO = dot(N, sd->I);
+			float fresnel = fresnel_dielectric_cos(cosNO, ior);
+
+			// calculate weights of the diffuse and specular part
+			float diffuse_weight = (1.0f - saturate(metallic)) * (1.0f - saturate(transmission));
+			
+			float final_transmission = saturate(transmission) * (1.0f - saturate(metallic));
+			float specular_weight = (1.0f - final_transmission);
+
+			// get the base color
+			uint4 data_base_color = read_node(kg, offset);
+			float3 base_color = stack_valid(data_base_color.x) ? stack_load_float3(stack, data_base_color.x) :
+				make_float3(__uint_as_float(data_base_color.y), __uint_as_float(data_base_color.z), __uint_as_float(data_base_color.w));
+
+			// get the additional clearcoat normal and subsurface scattering radius
+			uint4 data_cn_ssr = read_node(kg, offset);
+			float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ? stack_load_float3(stack, data_cn_ssr.x) : sd->N;
+			float3 subsurface_radius = stack_valid(data_cn_ssr.y) ? stack_load_float3(stack, data_cn_ssr.y) : make_float3(1.0f, 1.0f, 1.0f);
+
+			// get the subsurface color
+			uint4 data_subsurface_color = read_node(kg, offset);
+			float3 subsurface_color = stack_valid(data_subsurface_color.x) ? stack_load_float3(stack, data_subsurface_color.x) :
+				make_float3(__uint_as_float(data_subsurface_color.y), __uint_as_float(data_subsurface_color.z), __uint_as_float(data_subsurface_color.w));
+
+			float3 weight = sd->svm_closure_weight * mix_weight;
+
+#ifdef __SUBSURFACE__
+			float3 mixed_ss_base_color = subsurface_color * subsurface + base_color * (1.0f - subsurface);
+			float3 subsurf_weight = weight * mixed_ss_base_color * diffuse_weight;
+			float subsurf_sample_weight = fabsf(average(subsurf_weight));
+
+			/* disable in case of diffuse ancestor, can't see it well then and
+			 * adds considerable noise due to probabilities of continuing path
+			 * getting lower and lower */
+			if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR) {
+				subsurface = 0.0f;
+
+				/* need to set the base color in this case such that the
+				 * rays get the correctly mixed color after transmitting
+				 * the object */
+				base_color = mixed_ss_base_color;
+			}
+
+			/* diffuse */
+			if(fabsf(average(mixed_ss_base_color)) > CLOSURE_WEIGHT_CUTOFF) {
+				if(subsurface <= CLOSURE_WEIGHT_CUTOFF && diffuse_weight > CLOSURE_WEIGHT_CUTOFF) {
+					float3 diff_weight = weight * base_color * diffuse_weight;
+
+					PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight);
+
+					if(bsdf) {
+						bsdf->N = N;
+						bsdf->roughness = roughness;
+
+						/* setup bsdf */
+						sd->flag |= bsdf_principled_diffuse_setup(bsdf);
+					}
+				}
+				else if(subsurface > CLOSURE_WEIGHT_CUTOFF && subsurf_sample_weight > CLOSURE_WEIGHT_CUTOFF) {
+					/* radius * scale */
+					float3 radius = subsurface_radius * subsurface;
+					/* sharpness */
+					float sharpness = 0.0f;
+					/* texture color blur */
+					float texture_blur = 0.0f;
+
+					/* create one closure per color channel */
+					Bssrdf *bssrdf = bssrdf_alloc(sd, make_float3(subsurf_weight.x, 0.0f, 0.0f));
+					if(bssrdf) {
+						bssrdf->sample_weight = subsurf_sample_weight;
+						bssrdf->radius = radius.x;
+						bssrdf->texture_blur = texture_blur;
+						bssrdf->albedo = subsurface_color.x;
+						bssrdf->sharpness = sharpness;
+						bssrdf->N = N;
+						bssrdf->roughness = roughness;
+
+						/* setup bsdf */
+						sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
+					}
+
+					bssrdf = bssrdf_alloc(sd, make_float3(0.0f, subsurf_weight.y, 0.0f));
+					if(bssrdf) {
+						bssrdf->sample_weight = subsurf_sample_weight;
+						bssrdf->radius = radius.y;
+						bssrdf->texture_blur = texture_blur;
+						bssrdf->albedo = subsurface_color.y;
+						bssrdf->sharpness = sharpness;
+						bssrdf->N = N;
+						bssrdf->roughness = roughness;
+
+						/* setup bsdf */
+						sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
+					}
+
+					bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, subsurf_weight.z));
+					if(bssrdf) {
+						bssrdf->sample_weight = subsurf_sample_weight;
+						bssrdf->radius = radius.z;
+						bssrdf->texture_blur = texture_blur;
+						bssrdf->albedo = subsurface_color.z;
+						bssrdf->sharpness = sharpness;
+						bssrdf->N = N;
+						bssrdf->roughness = roughness;
+
+						/* setup bsdf */
+						sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
+					}
+				}
+			}
+#else
+			/* diffuse */
+			if(diffuse_weight > CLOSURE_WEIGHT_CUTOFF) {
+				float3 diff_weight = weight * base_color * diffuse_weight;
+
+				PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf*)bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight);
+
+				if(bsdf) {
+					bsdf->N = N;
+					bsdf->roughness = roughness;
+
+					/* setup bsdf */
+					sd->flag |= bsdf_principled_diffuse_setup(bsdf);
+				}
+			}
+#endif
+
+			/* sheen */
+			if(diffuse_weight > CLOSURE_WEIGHT_CUTOFF && sheen > CLOSURE_WEIGHT_CUTOFF) {
+				float m_cdlum = linear_rgb_to_gray(base_color);
+				float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(1.0f, 1.0f, 1.0f); // normalize lum. to isolate hue+sat
+
+				/* color of the sheen component */
+				float3 sheen_color = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - sheen_tint) + m_ctint * sheen_tint;
+
+				float3 sheen_weight = weight * sheen * sheen_color * diffuse_weight;
+
+				PrincipledSheenBsdf *bsdf = (PrincipledSheenBsdf*)bsdf_alloc(sd, sizeof(PrincipledSheenBsdf), sheen_weight);
+
+				if(bsdf) {
+					bsdf->N = N;
+
+					/* setup bsdf */
+					sd->flag |= bsdf_principled_sheen_setup(bsdf);
+				}
+			}
+
+			/* specular reflection */
+#ifdef __CAUSTICS_TRICKS__
+			if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) {
+#endif
+				if(specular_weight > CLOSURE_WEIGHT_CUTOFF && (specular > CLOSURE_WEIGHT_CUTOFF || metallic > CLOSURE_WEIGHT_CUTOFF)) {
+					float3 spec_weight = weight * specular_weight;
+
+					MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), spec_weight);
+					MicrofacetExtra *extra = (bsdf != NULL)? (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)): NULL; /* no extra block when the closure alloc failed */
+
+					if(bsdf && extra) {
+						bsdf->N = N;
+						bsdf->ior = (2.0f / (1.0f - safe_sqrtf(0.08f * specular))) - 1.0f; /* ior whose normal-incidence reflectance ((ior-1)/(ior+1))^2 equals 0.08 * specular */
+						bsdf->T = T;
+						bsdf->extra = extra;
+
+						float aspect = safe_sqrtf(1.0f - anisotropic * 0.9f);
+						float r2 = roughness * roughness;
+
+						bsdf->alpha_x = r2 / aspect;
+						bsdf->alpha_y = r2 * aspect;
+
+						float m_cdlum = 0.3f * base_color.x + 0.6f * base_color.y + 0.1f * base_color.z; // luminance approx.
+						float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(0.0f, 0.0f, 0.0f); // normalize lum. to isolate hue+sat
+						float3 tmp_col = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint) + m_ctint * specular_tint;
+
+						bsdf->extra->cspec0 = (specular * 0.08f * tmp_col) * (1.0f - metallic) + base_color * metallic;
+						bsdf->extra->color = base_color;
+
+						/* setup bsdf */
+						if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID || roughness <= 0.075f) /* use single-scatter GGX */
+							sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf, sd);
+						else /* use multi-scatter GGX */
+							sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf, sd);
+					}
+				}
+#ifdef __CAUSTICS_TRICKS__
+			}
+#endif
+
+			/* glass: reflection and refraction closures, weighted by final_transmission */
+#ifdef __CAUSTICS_TRICKS__
+			if(kernel_data.integrator.caustics_reflective || kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0) {
+#endif
+				if(final_transmission > CLOSURE_WEIGHT_CUTOFF) {
+					float3 glass_weight = weight * final_transmission;
+					float3 cspec0 = base_color * specular_tint + make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint);
+
+					if(roughness <= 5e-2f || distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID) { /* use single-scatter GGX */
+						float refl_roughness = roughness;
+
+						/* reflection, weighted by the fresnel term */
+#ifdef __CAUSTICS_TRICKS__
+						if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0)
+#endif
+						{
+							MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), glass_weight*fresnel);
+							MicrofacetExtra *extra = (bsdf != NULL)? (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)): NULL; /* no extra block when the closure alloc failed */
+
+							if(bsdf && extra) {
+								bsdf->N = N;
+								bsdf->extra = extra;
+
+								bsdf->alpha_x = refl_roughness * refl_roughness;
+								bsdf->alpha_y = refl_roughness * refl_roughness;
+								bsdf->ior = ior;
+
+								bsdf->extra->color = base_color;
+								bsdf->extra->cspec0 = cspec0;
+
+								/* setup bsdf */
+								sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf, sd);
+							}
+						}
+
+						/* refraction, weighted by (1 - fresnel) and tinted by base_color */
+#ifdef __CAUSTICS_TRICKS__
+						if(kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0)
+#endif
+						{
+							MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), base_color*glass_weight*(1.0f - fresnel));
+
+							if(bsdf) {
+								bsdf->N = N;
+
+								if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID)
+									transmission_roughness = 1.0f - (1.0f - refl_roughness) * (1.0f - transmission_roughness);
+								else
+									transmission_roughness = refl_roughness;
+
+								bsdf->alpha_x = transmission_roughness * transmission_roughness;
+								bsdf->alpha_y = transmission_roughness * transmission_roughness;
+								bsdf->ior = ior;
+
+								/* setup bsdf */
+								sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+							}
+						}
+					}
+					else { /* use multi-scatter GGX */
+						MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), glass_weight);
+						MicrofacetExtra *extra = (bsdf != NULL)? (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)): NULL; /* no extra block when the closure alloc failed */
+
+						if(bsdf && extra) {
+							bsdf->N = N;
+							bsdf->extra = extra;
+							bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+
+							bsdf->alpha_x = roughness * roughness;
+							bsdf->alpha_y = roughness * roughness;
+							bsdf->ior = ior;
+
+							bsdf->extra->color = base_color;
+							bsdf->extra->cspec0 = cspec0;
+
+							/* setup bsdf */
+							sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd);
+						}
+					}
+				}
+#ifdef __CAUSTICS_TRICKS__
+			}
+#endif
+
+			/* clearcoat */
+#ifdef __CAUSTICS_TRICKS__
+			if(kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) {
+#endif
+				if(clearcoat > CLOSURE_WEIGHT_CUTOFF) {
+					MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
+					MicrofacetExtra *extra = (bsdf != NULL)? (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)): NULL; /* no extra block when the closure alloc failed */
+
+					if(bsdf && extra) {
+						bsdf->N = clearcoat_normal;
+						bsdf->ior = 1.5f;
+						bsdf->extra = extra;
+
+						bsdf->alpha_x = clearcoat_roughness * clearcoat_roughness;
+						bsdf->alpha_y = clearcoat_roughness * clearcoat_roughness;
+
+						bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f); /* fixed specular color; matches ((ior-1)/(ior+1))^2 for ior 1.5 */
+						bsdf->extra->clearcoat = clearcoat;
+
+						/* setup bsdf */
+						sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd);
+					}
+				}
+#ifdef __CAUSTICS_TRICKS__
+			}
+#endif
+
+			break;
+		}
+#endif  /* __PRINCIPLED__ */
 		case CLOSURE_BSDF_DIFFUSE_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			OrenNayarBsdf *bsdf = (OrenNayarBsdf*)bsdf_alloc(sd, sizeof(OrenNayarBsdf), weight);
 
 			if(bsdf) {
@@ -86,31 +425,32 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				float roughness = param1;
 
 				if(roughness == 0.0f) {
-					ccl_fetch(sd, flag) |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf);
+					sd->flag |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf);
 				}
 				else {
 					bsdf->roughness = roughness;
-					ccl_fetch(sd, flag) |= bsdf_oren_nayar_setup(bsdf);
+					sd->flag |= bsdf_oren_nayar_setup(bsdf);
 				}
 			}
 			break;
 		}
 		case CLOSURE_BSDF_TRANSLUCENT_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
 
 			if(bsdf) {
 				bsdf->N = N;
-				ccl_fetch(sd, flag) |= bsdf_translucent_setup(bsdf);
+				sd->flag |= bsdf_translucent_setup(bsdf);
 			}
 			break;
 		}
 		case CLOSURE_BSDF_TRANSPARENT_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
 
 			if(bsdf) {
-				ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf);
+				bsdf->N = N;
+				sd->flag |= bsdf_transparent_setup(bsdf);
 			}
 			break;
 		}
@@ -123,7 +463,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
 
 			if(bsdf) {
@@ -135,21 +475,21 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 				/* setup bsdf */
 				if(type == CLOSURE_BSDF_REFLECTION_ID)
-					ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf);
+					sd->flag |= bsdf_reflection_setup(bsdf);
 				else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID)
-					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf);
+					sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
 				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID)
-					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf);
+					sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
 				else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID) {
 					kernel_assert(stack_valid(data_node.z));
 					bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
 					if(bsdf->extra) {
 						bsdf->extra->color = stack_load_float3(stack, data_node.z);
-						ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_setup(bsdf);
+						sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf);
 					}
 				}
 				else
-					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_setup(bsdf);
+					sd->flag |= bsdf_ashikhmin_shirley_setup(bsdf);
 			}
 
 			break;
@@ -161,7 +501,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
 
 			if(bsdf) {
@@ -169,7 +509,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				bsdf->extra = NULL;
 
 				float eta = fmaxf(param2, 1e-5f);
-				eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+				eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
 
 				/* setup bsdf */
 				if(type == CLOSURE_BSDF_REFRACTION_ID) {
@@ -177,7 +517,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bsdf->alpha_y = 0.0f;
 					bsdf->ior = eta;
 
-					ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf);
+					sd->flag |= bsdf_refraction_setup(bsdf);
 				}
 				else {
 					bsdf->alpha_x = param1;
@@ -185,9 +525,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bsdf->ior = eta;
 
 					if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID)
-						ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
+						sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
 					else
-						ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+						sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
 				}
 			}
 
@@ -203,14 +543,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				break;
 			}
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 
 			/* index of refraction */
 			float eta = fmaxf(param2, 1e-5f);
-			eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+			eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
 
 			/* fresnel */
-			float cosNO = dot(N, ccl_fetch(sd, I));
+			float cosNO = dot(N, sd->I);
 			float fresnel = fresnel_dielectric_cos(cosNO, eta);
 			float roughness = param1;
 
@@ -249,7 +589,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_reflective && !kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
 			MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
 
@@ -261,13 +601,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				bsdf->alpha_x = param1;
 				bsdf->alpha_y = param1;
 				float eta = fmaxf(param2, 1e-5f);
-				bsdf->ior = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+				bsdf->ior = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
 
 				kernel_assert(stack_valid(data_node.z));
 				bsdf->extra->color = stack_load_float3(stack, data_node.z);
 
 				/* setup bsdf */
-				ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_glass_setup(bsdf);
+				sd->flag |= bsdf_microfacet_multi_ggx_glass_setup(bsdf);
 			}
 
 			break;
@@ -280,7 +620,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
 
 			if(bsdf) {
@@ -310,33 +650,33 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				bsdf->ior = 0.0f;
 
 				if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID) {
-					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_aniso_setup(bsdf);
+					sd->flag |= bsdf_microfacet_beckmann_aniso_setup(bsdf);
 				}
 				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID) {
-					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_setup(bsdf);
+					sd->flag |= bsdf_microfacet_ggx_aniso_setup(bsdf);
 				}
 				else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID) {
 					kernel_assert(stack_valid(data_node.w));
 					bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
 					if(bsdf->extra) {
 						bsdf->extra->color = stack_load_float3(stack, data_node.w);
-						ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf);
+						sd->flag |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf);
 					}
 				}
 				else
-					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_aniso_setup(bsdf);
+					sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(bsdf);
 			}
 			break;
 		}
 		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			VelvetBsdf *bsdf = (VelvetBsdf*)bsdf_alloc(sd, sizeof(VelvetBsdf), weight);
 
 			if(bsdf) {
 				bsdf->N = N;
 
 				bsdf->sigma = saturate(param1);
-				ccl_fetch(sd, flag) |= bsdf_ashikhmin_velvet_setup(bsdf);
+				sd->flag |= bsdf_ashikhmin_velvet_setup(bsdf);
 			}
 			break;
 		}
@@ -344,9 +684,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 #ifdef __CAUSTICS_TRICKS__
 			if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
 				break;
+			ATTR_FALLTHROUGH;
 #endif
 		case CLOSURE_BSDF_DIFFUSE_TOON_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			ToonBsdf *bsdf = (ToonBsdf*)bsdf_alloc(sd, sizeof(ToonBsdf), weight);
 
 			if(bsdf) {
@@ -355,34 +696,36 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				bsdf->smooth = param2;
 				
 				if(type == CLOSURE_BSDF_DIFFUSE_TOON_ID)
-					ccl_fetch(sd, flag) |= bsdf_diffuse_toon_setup(bsdf);
+					sd->flag |= bsdf_diffuse_toon_setup(bsdf);
 				else
-					ccl_fetch(sd, flag) |= bsdf_glossy_toon_setup(bsdf);
+					sd->flag |= bsdf_glossy_toon_setup(bsdf);
 			}
 			break;
 		}
 #ifdef __HAIR__
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			
-			if(ccl_fetch(sd, flag) & SD_BACKFACING && ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+			if(sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) {
 				ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
 
 				if(bsdf) {
+					bsdf->N = N;
 					/* todo: giving a fixed weight here will cause issues when
 					 * mixing multiple BSDFS. energy will not be conserved and
 					 * the throughput can blow up after multiple bounces. we
 					 * better figure out a way to skip backfaces from rays
 					 * spawned by transmission from the front */
 					bsdf->weight = make_float3(1.0f, 1.0f, 1.0f);
-					ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf);
+					sd->flag |= bsdf_transparent_setup(bsdf);
 				}
 			}
 			else {
 				HairBsdf *bsdf = (HairBsdf*)bsdf_alloc(sd, sizeof(HairBsdf), weight);
 
 				if(bsdf) {
+					bsdf->N = N;
 					bsdf->roughness1 = param1;
 					bsdf->roughness2 = param2;
 					bsdf->offset = -stack_load_float(stack, data_node.z);
@@ -390,18 +733,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					if(stack_valid(data_node.y)) {
 						bsdf->T = normalize(stack_load_float3(stack, data_node.y));
 					}
-					else if(!(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)) {
-						bsdf->T = normalize(ccl_fetch(sd, dPdv));
+					else if(!(sd->type & PRIMITIVE_ALL_CURVE)) {
+						bsdf->T = normalize(sd->dPdv);
 						bsdf->offset = 0.0f;
 					}
 					else
-						bsdf->T = normalize(ccl_fetch(sd, dPdu));
+						bsdf->T = normalize(sd->dPdu);
 
 					if(type == CLOSURE_BSDF_HAIR_REFLECTION_ID) {
-						ccl_fetch(sd, flag) |= bsdf_hair_reflection_setup(bsdf);
+						sd->flag |= bsdf_hair_reflection_setup(bsdf);
 					}
 					else {
-						ccl_fetch(sd, flag) |= bsdf_hair_transmission_setup(bsdf);
+						sd->flag |= bsdf_hair_transmission_setup(bsdf);
 					}
 				}
 			}
@@ -414,8 +757,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 		case CLOSURE_BSSRDF_CUBIC_ID:
 		case CLOSURE_BSSRDF_GAUSSIAN_ID:
 		case CLOSURE_BSSRDF_BURLEY_ID: {
-			float3 albedo = ccl_fetch(sd, svm_closure_weight);
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 albedo = sd->svm_closure_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			float sample_weight = fabsf(average(weight));
 			
 			/* disable in case of diffuse ancestor, can't see it well then and
@@ -441,7 +784,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bssrdf->albedo = albedo.x;
 					bssrdf->sharpness = sharpness;
 					bssrdf->N = N;
-					ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+					sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
 				}
 
 				bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f));
@@ -452,7 +795,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bssrdf->albedo = albedo.y;
 					bssrdf->sharpness = sharpness;
 					bssrdf->N = N;
-					ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+					sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
 				}
 
 				bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z));
@@ -463,7 +806,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bssrdf->albedo = albedo.z;
 					bssrdf->sharpness = sharpness;
 					bssrdf->N = N;
-					ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+					sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
 				}
 			}
 
@@ -493,21 +836,21 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float
 
 	switch(type) {
 		case CLOSURE_VOLUME_ABSORPTION_ID: {
-			float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - ccl_fetch(sd, svm_closure_weight)) * mix_weight * density;
+			float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - sd->svm_closure_weight) * mix_weight * density;
 			ShaderClosure *sc = closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_NONE_ID, weight);
 
 			if(sc) {
-				ccl_fetch(sd, flag) |= volume_absorption_setup(sc);
+				sd->flag |= volume_absorption_setup(sc);
 			}
 			break;
 		}
 		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight * density;
+			float3 weight = sd->svm_closure_weight * mix_weight * density;
 			HenyeyGreensteinVolume *volume = (HenyeyGreensteinVolume*)bsdf_alloc(sd, sizeof(HenyeyGreensteinVolume), weight);
 
 			if(volume) {
 				volume->g = param2; /* g */
-				ccl_fetch(sd, flag) |= volume_henyey_greenstein_setup(volume);
+				sd->flag |= volume_henyey_greenstein_setup(volume);
 			}
 			break;
 		}
@@ -527,12 +870,12 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no
 		if(mix_weight == 0.0f)
 			return;
 
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, sd->svm_closure_weight * mix_weight);
 	}
 	else
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight));
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, sd->svm_closure_weight);
 
-	ccl_fetch(sd, flag) |= SD_EMISSION;
+	sd->flag |= SD_EMISSION;
 }
 
 ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
@@ -545,10 +888,10 @@ ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4
 		if(mix_weight == 0.0f)
 			return;
 
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, sd->svm_closure_weight * mix_weight);
 	}
 	else
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight));
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, sd->svm_closure_weight);
 }
 
 ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node)
@@ -561,12 +904,12 @@ ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 nod
 		if(mix_weight == 0.0f)
 			return;
 
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight * mix_weight);
 	}
 	else
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight));
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight);
 
-	ccl_fetch(sd, flag) |= SD_HOLDOUT;
+	sd->flag |= SD_HOLDOUT;
 }
 
 ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node)
@@ -579,19 +922,19 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack,
 		if(mix_weight == 0.0f)
 			return;
 
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight * mix_weight);
 	}
 	else
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight));
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight);
 
-	ccl_fetch(sd, flag) |= SD_AO;
+	sd->flag |= SD_AO;
 }
 
 /* Closure Nodes */
 
 ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight)
 {
-	ccl_fetch(sd, svm_closure_weight) = weight;
+	sd->svm_closure_weight = weight; /* kept on the shader data; the svm_node_closure_* functions read it back when they allocate closures */
 }
 
 ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b)
@@ -641,7 +984,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
 ccl_device void svm_node_set_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
 {
 	float3 normal = stack_load_float3(stack, in_direction);
-	ccl_fetch(sd, N) = normal;
+	sd->N = normal; /* overwrite sd->N with the vector loaded from the stack; the same vector is also stored to out_normal below */
 	stack_store_float3(stack, out_normal, normal);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index 890ab41aaaa..656357be52d 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -25,10 +25,10 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 	uint normal_offset, distance_offset, invert, use_object_space;
 	decode_node_uchar4(node.y, &normal_offset, &distance_offset, &invert, &use_object_space);
 
-	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
+	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
 
-	float3 dPdx = ccl_fetch(sd, dP).dx;
-	float3 dPdy = ccl_fetch(sd, dP).dy;
+	float3 dPdx = sd->dP.dx;
+	float3 dPdy = sd->dP.dy;
 
 	if(use_object_space) {
 		object_inverse_normal_transform(kg, sd, &normal_in);
@@ -63,8 +63,13 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 	strength = max(strength, 0.0f);
 
 	/* compute and output perturbed normal */
-	float3 normal_out = normalize(absdet*normal_in - distance*signf(det)*surfgrad);
-	normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
+	float3 normal_out = safe_normalize(absdet*normal_in - distance*signf(det)*surfgrad);
+	if(is_zero(normal_out)) {
+		normal_out = normal_in;
+	}
+	else {
+		normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
+	}
 
 	if(use_object_space) {
 		object_normal_transform(kg, sd, &normal_out);
@@ -80,14 +85,14 @@ ccl_device void svm_node_set_displacement(KernelGlobals *kg, ShaderData *sd, flo
 {
 	float d = stack_load_float(stack, fac_offset);
 
-	float3 dP = ccl_fetch(sd, N);
+	float3 dP = sd->N;
 	object_inverse_normal_transform(kg, sd, &dP);
 
 	dP *= d*0.1f; /* todo: get rid of this factor */
 
 	object_dir_transform(kg, sd, &dP);
 
-	ccl_fetch(sd, P) += dP;
+	sd->P += dP;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h
index 23c97d80cb0..3703ec55015 100644
--- a/intern/cycles/kernel/svm/svm_fresnel.h
+++ b/intern/cycles/kernel/svm/svm_fresnel.h
@@ -23,12 +23,12 @@ ccl_device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset,
 	uint normal_offset, out_offset;
 	decode_node_uchar4(node, &normal_offset, &out_offset, NULL, NULL);
 	float eta = (stack_valid(ior_offset))? stack_load_float(stack, ior_offset): __uint_as_float(ior_value);
-	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
+	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
 	
 	eta = fmaxf(eta, 1e-5f);
-	eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+	eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
 
-	float f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);
+	float f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
 
 	stack_store_float(stack, out_offset, f);
 }
@@ -44,18 +44,18 @@ ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
 	decode_node_uchar4(node.w, &type, &normal_offset, &out_offset, NULL);
 
 	float blend = (stack_valid(blend_offset))? stack_load_float(stack, blend_offset): __uint_as_float(blend_value);
-	float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
+	float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): sd->N;
 
 	float f;
 
 	if(type == NODE_LAYER_WEIGHT_FRESNEL) {
 		float eta = fmaxf(1.0f - blend, 1e-5f);
-		eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? eta: 1.0f/eta;
+		eta = (sd->flag & SD_BACKFACING)? eta: 1.0f/eta;
 
-		f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);
+		f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
 	}
 	else {
-		f = fabsf(dot(ccl_fetch(sd, I), normal_in));
+		f = fabsf(dot(sd->I, normal_in));
 
 		if(blend != 0.5f) {
 			blend = clamp(blend, 0.0f, 1.0f-1e-5f);
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index 7d512f7ff4d..cce4e89e715 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -27,16 +27,17 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg,
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = ccl_fetch(sd, P); break;
-		case NODE_GEOM_N: data = ccl_fetch(sd, N); break;
+		case NODE_GEOM_P: data = sd->P; break;
+		case NODE_GEOM_N: data = sd->N; break;
 #ifdef __DPDU__
 		case NODE_GEOM_T: data = primitive_tangent(kg, sd); break;
 #endif
-		case NODE_GEOM_I: data = ccl_fetch(sd, I); break;
-		case NODE_GEOM_Ng: data = ccl_fetch(sd, Ng); break;
+		case NODE_GEOM_I: data = sd->I; break;
+		case NODE_GEOM_Ng: data = sd->Ng; break;
 #ifdef __UV__
-		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u), ccl_fetch(sd, v), 0.0f); break;
+		case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
 #endif
+		default: data = make_float3(0.0f, 0.0f, 0.0f);
 	}
 
 	stack_store_float3(stack, out_offset, data);
@@ -48,8 +49,8 @@ ccl_device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, flo
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; break;
-		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dx, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dx, 0.0f); break;
+		case NODE_GEOM_P: data = sd->P + sd->dP.dx; break;
+		case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dx, sd->v + sd->dv.dx, 0.0f); break;
 		default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
 	}
 
@@ -65,8 +66,8 @@ ccl_device void svm_node_geometry_bump_dy(KernelGlobals *kg, ShaderData *sd, flo
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; break;
-		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dy, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dy, 0.0f); break;
+		case NODE_GEOM_P: data = sd->P + sd->dP.dy; break;
+		case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dy, sd->v + sd->dv.dy, 0.0f); break;
 		default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
 	}
 
@@ -87,9 +88,9 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s
 			stack_store_float3(stack, out_offset, object_location(kg, sd));
 			return;
 		}
-		case NODE_INFO_OB_INDEX: data = object_pass_id(kg, ccl_fetch(sd, object)); break;
+		case NODE_INFO_OB_INDEX: data = object_pass_id(kg, sd->object); break;
 		case NODE_INFO_MAT_INDEX: data = shader_pass_id(kg, sd); break;
-		case NODE_INFO_OB_RANDOM: data = object_random_number(kg, ccl_fetch(sd, object)); break;
+		case NODE_INFO_OB_RANDOM: data = object_random_number(kg, sd->object); break;
 		default: data = 0.0f; break;
 	}
 
@@ -106,44 +107,44 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg,
 {
 	switch(type) {
 		case NODE_INFO_PAR_INDEX: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_index(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_AGE: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_age(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_LIFETIME: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_lifetime(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_LOCATION: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float3(stack, out_offset, particle_location(kg, particle_id));
 			break;
 		}
 #if 0	/* XXX float4 currently not supported in SVM stack */
 		case NODE_INFO_PAR_ROTATION: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float4(stack, out_offset, particle_rotation(kg, particle_id));
 			break;
 		}
 #endif
 		case NODE_INFO_PAR_SIZE: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_size(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_VELOCITY: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float3(stack, out_offset, particle_velocity(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_ANGULAR_VELOCITY: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float3(stack, out_offset, particle_angular_velocity(kg, particle_id));
 			break;
 		}
@@ -165,7 +166,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg,
 
 	switch(type) {
 		case NODE_INFO_CURVE_IS_STRAND: {
-			data = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) != 0;
+			data = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
 			stack_store_float(stack, out_offset, data);
 			break;
 		}
@@ -177,7 +178,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg,
 			break;
 		}
 		/*case NODE_INFO_CURVE_FADE: {
-			data = ccl_fetch(sd, curve_transparency);
+			data = sd->curve_transparency;
 			stack_store_float(stack, out_offset, data);
 			break;
 		}*/
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 2afdf61b476..4226e7adfe0 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -16,186 +16,25 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Float4 textures on various devices. */
-#if defined(__KERNEL_CPU__)
-#  define TEX_NUM_FLOAT4_IMAGES		TEX_NUM_FLOAT4_CPU
-#elif defined(__KERNEL_CUDA__)
-#  if __CUDA_ARCH__ < 300
-#    define TEX_NUM_FLOAT4_IMAGES	TEX_NUM_FLOAT4_CUDA
-#  else
-#    define TEX_NUM_FLOAT4_IMAGES	TEX_NUM_FLOAT4_CUDA_KEPLER
-#  endif
-#else
-#  define TEX_NUM_FLOAT4_IMAGES	TEX_NUM_FLOAT4_OPENCL
-#endif
-
 ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha)
 {
-#ifdef __KERNEL_CPU__
-#  ifdef __KERNEL_SSE2__
-	ssef r_ssef;
-	float4 &r = (float4 &)r_ssef;
-	r = kernel_tex_image_interp(id, x, y);
-#  else
-	float4 r = kernel_tex_image_interp(id, x, y);
-#  endif
-#elif defined(__KERNEL_OPENCL__)
 	float4 r = kernel_tex_image_interp(kg, id, x, y);
-#else
-	float4 r;
-
-#  if __CUDA_ARCH__ < 300
-	/* not particularly proud of this massive switch, what are the
-	 * alternatives?
-	 * - use a single big 1D texture, and do our own lookup/filtering
-	 * - group by size and use a 3d texture, performance impact
-	 * - group into larger texture with some padding for correct lerp
-	 *
-	 * also note that cuda has a textures limit (128 for Fermi, 256 for Kepler),
-	 * and we cannot use all since we still need some for other storage */
-
-	switch(id) {
-		case 0: r = kernel_tex_image_interp(__tex_image_float4_000, x, y); break;
-		case 1: r = kernel_tex_image_interp(__tex_image_float4_001, x, y); break;
-		case 2: r = kernel_tex_image_interp(__tex_image_float4_002, x, y); break;
-		case 3: r = kernel_tex_image_interp(__tex_image_float4_003, x, y); break;
-		case 4: r = kernel_tex_image_interp(__tex_image_float4_004, x, y); break;
-		case 5: r = kernel_tex_image_interp(__tex_image_byte4_005, x, y); break;
-		case 6: r = kernel_tex_image_interp(__tex_image_byte4_006, x, y); break;
-		case 7: r = kernel_tex_image_interp(__tex_image_byte4_007, x, y); break;
-		case 8: r = kernel_tex_image_interp(__tex_image_byte4_008, x, y); break;
-		case 9: r = kernel_tex_image_interp(__tex_image_byte4_009, x, y); break;
-		case 10: r = kernel_tex_image_interp(__tex_image_byte4_010, x, y); break;
-		case 11: r = kernel_tex_image_interp(__tex_image_byte4_011, x, y); break;
-		case 12: r = kernel_tex_image_interp(__tex_image_byte4_012, x, y); break;
-		case 13: r = kernel_tex_image_interp(__tex_image_byte4_013, x, y); break;
-		case 14: r = kernel_tex_image_interp(__tex_image_byte4_014, x, y); break;
-		case 15: r = kernel_tex_image_interp(__tex_image_byte4_015, x, y); break;
-		case 16: r = kernel_tex_image_interp(__tex_image_byte4_016, x, y); break;
-		case 17: r = kernel_tex_image_interp(__tex_image_byte4_017, x, y); break;
-		case 18: r = kernel_tex_image_interp(__tex_image_byte4_018, x, y); break;
-		case 19: r = kernel_tex_image_interp(__tex_image_byte4_019, x, y); break;
-		case 20: r = kernel_tex_image_interp(__tex_image_byte4_020, x, y); break;
-		case 21: r = kernel_tex_image_interp(__tex_image_byte4_021, x, y); break;
-		case 22: r = kernel_tex_image_interp(__tex_image_byte4_022, x, y); break;
-		case 23: r = kernel_tex_image_interp(__tex_image_byte4_023, x, y); break;
-		case 24: r = kernel_tex_image_interp(__tex_image_byte4_024, x, y); break;
-		case 25: r = kernel_tex_image_interp(__tex_image_byte4_025, x, y); break;
-		case 26: r = kernel_tex_image_interp(__tex_image_byte4_026, x, y); break;
-		case 27: r = kernel_tex_image_interp(__tex_image_byte4_027, x, y); break;
-		case 28: r = kernel_tex_image_interp(__tex_image_byte4_028, x, y); break;
-		case 29: r = kernel_tex_image_interp(__tex_image_byte4_029, x, y); break;
-		case 30: r = kernel_tex_image_interp(__tex_image_byte4_030, x, y); break;
-		case 31: r = kernel_tex_image_interp(__tex_image_byte4_031, x, y); break;
-		case 32: r = kernel_tex_image_interp(__tex_image_byte4_032, x, y); break;
-		case 33: r = kernel_tex_image_interp(__tex_image_byte4_033, x, y); break;
-		case 34: r = kernel_tex_image_interp(__tex_image_byte4_034, x, y); break;
-		case 35: r = kernel_tex_image_interp(__tex_image_byte4_035, x, y); break;
-		case 36: r = kernel_tex_image_interp(__tex_image_byte4_036, x, y); break;
-		case 37: r = kernel_tex_image_interp(__tex_image_byte4_037, x, y); break;
-		case 38: r = kernel_tex_image_interp(__tex_image_byte4_038, x, y); break;
-		case 39: r = kernel_tex_image_interp(__tex_image_byte4_039, x, y); break;
-		case 40: r = kernel_tex_image_interp(__tex_image_byte4_040, x, y); break;
-		case 41: r = kernel_tex_image_interp(__tex_image_byte4_041, x, y); break;
-		case 42: r = kernel_tex_image_interp(__tex_image_byte4_042, x, y); break;
-		case 43: r = kernel_tex_image_interp(__tex_image_byte4_043, x, y); break;
-		case 44: r = kernel_tex_image_interp(__tex_image_byte4_044, x, y); break;
-		case 45: r = kernel_tex_image_interp(__tex_image_byte4_045, x, y); break;
-		case 46: r = kernel_tex_image_interp(__tex_image_byte4_046, x, y); break;
-		case 47: r = kernel_tex_image_interp(__tex_image_byte4_047, x, y); break;
-		case 48: r = kernel_tex_image_interp(__tex_image_byte4_048, x, y); break;
-		case 49: r = kernel_tex_image_interp(__tex_image_byte4_049, x, y); break;
-		case 50: r = kernel_tex_image_interp(__tex_image_byte4_050, x, y); break;
-		case 51: r = kernel_tex_image_interp(__tex_image_byte4_051, x, y); break;
-		case 52: r = kernel_tex_image_interp(__tex_image_byte4_052, x, y); break;
-		case 53: r = kernel_tex_image_interp(__tex_image_byte4_053, x, y); break;
-		case 54: r = kernel_tex_image_interp(__tex_image_byte4_054, x, y); break;
-		case 55: r = kernel_tex_image_interp(__tex_image_byte4_055, x, y); break;
-		case 56: r = kernel_tex_image_interp(__tex_image_byte4_056, x, y); break;
-		case 57: r = kernel_tex_image_interp(__tex_image_byte4_057, x, y); break;
-		case 58: r = kernel_tex_image_interp(__tex_image_byte4_058, x, y); break;
-		case 59: r = kernel_tex_image_interp(__tex_image_byte4_059, x, y); break;
-		case 60: r = kernel_tex_image_interp(__tex_image_byte4_060, x, y); break;
-		case 61: r = kernel_tex_image_interp(__tex_image_byte4_061, x, y); break;
-		case 62: r = kernel_tex_image_interp(__tex_image_byte4_062, x, y); break;
-		case 63: r = kernel_tex_image_interp(__tex_image_byte4_063, x, y); break;
-		case 64: r = kernel_tex_image_interp(__tex_image_byte4_064, x, y); break;
-		case 65: r = kernel_tex_image_interp(__tex_image_byte4_065, x, y); break;
-		case 66: r = kernel_tex_image_interp(__tex_image_byte4_066, x, y); break;
-		case 67: r = kernel_tex_image_interp(__tex_image_byte4_067, x, y); break;
-		case 68: r = kernel_tex_image_interp(__tex_image_byte4_068, x, y); break;
-		case 69: r = kernel_tex_image_interp(__tex_image_byte4_069, x, y); break;
-		case 70: r = kernel_tex_image_interp(__tex_image_byte4_070, x, y); break;
-		case 71: r = kernel_tex_image_interp(__tex_image_byte4_071, x, y); break;
-		case 72: r = kernel_tex_image_interp(__tex_image_byte4_072, x, y); break;
-		case 73: r = kernel_tex_image_interp(__tex_image_byte4_073, x, y); break;
-		case 74: r = kernel_tex_image_interp(__tex_image_byte4_074, x, y); break;
-		case 75: r = kernel_tex_image_interp(__tex_image_byte4_075, x, y); break;
-		case 76: r = kernel_tex_image_interp(__tex_image_byte4_076, x, y); break;
-		case 77: r = kernel_tex_image_interp(__tex_image_byte4_077, x, y); break;
-		case 78: r = kernel_tex_image_interp(__tex_image_byte4_078, x, y); break;
-		case 79: r = kernel_tex_image_interp(__tex_image_byte4_079, x, y); break;
-		case 80: r = kernel_tex_image_interp(__tex_image_byte4_080, x, y); break;
-		case 81: r = kernel_tex_image_interp(__tex_image_byte4_081, x, y); break;
-		case 82: r = kernel_tex_image_interp(__tex_image_byte4_082, x, y); break;
-		case 83: r = kernel_tex_image_interp(__tex_image_byte4_083, x, y); break;
-		case 84: r = kernel_tex_image_interp(__tex_image_byte4_084, x, y); break;
-		case 85: r = kernel_tex_image_interp(__tex_image_byte4_085, x, y); break;
-		case 86: r = kernel_tex_image_interp(__tex_image_byte4_086, x, y); break;
-		case 87: r = kernel_tex_image_interp(__tex_image_byte4_087, x, y); break;
-		case 88: r = kernel_tex_image_interp(__tex_image_byte4_088, x, y); break;
-		case 89: r = kernel_tex_image_interp(__tex_image_byte4_089, x, y); break;
-		default:
-			kernel_assert(0);
-			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-	}
-#  else
-	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
-	/* float4, byte4 and half4 */
-	if(id < TEX_START_FLOAT_CUDA_KEPLER)
-		r = kernel_tex_image_interp_float4(tex, x, y);
-	/* float, byte and half */
-	else {
-		float f = kernel_tex_image_interp_float(tex, x, y);
-		r = make_float4(f, f, f, 1.0f);
-	}
-#  endif
-#endif
-
-#ifdef __KERNEL_SSE2__
-	float alpha = r.w;
+	const float alpha = r.w;
 
 	if(use_alpha && alpha != 1.0f && alpha != 0.0f) {
-		r_ssef = r_ssef / ssef(alpha);
-		if(id >= TEX_NUM_FLOAT4_IMAGES)
-			r_ssef = min(r_ssef, ssef(1.0f));
-		r.w = alpha;
-	}
-
-	if(srgb) {
-		r_ssef = color_srgb_to_scene_linear(r_ssef);
-		r.w = alpha;
-	}
-#else
-	if(use_alpha && r.w != 1.0f && r.w != 0.0f) {
-		float invw = 1.0f/r.w;
-		r.x *= invw;
-		r.y *= invw;
-		r.z *= invw;
-
-		if(id >= TEX_NUM_FLOAT4_IMAGES) {
-			r.x = min(r.x, 1.0f);
-			r.y = min(r.y, 1.0f);
-			r.z = min(r.z, 1.0f);
+		r /= alpha;
+		const int texture_type = kernel_tex_type(id);
+		if(texture_type == IMAGE_DATA_TYPE_BYTE4 ||
+		   texture_type == IMAGE_DATA_TYPE_BYTE)
+		{
+			r = min(r, make_float4(1.0f, 1.0f, 1.0f, 1.0f));
 		}
+		r.w = alpha;
 	}
 
 	if(srgb) {
-		r.x = color_srgb_to_scene_linear(r.x);
-		r.y = color_srgb_to_scene_linear(r.y);
-		r.z = color_srgb_to_scene_linear(r.z);
+		r = color_srgb_to_scene_linear_v4(r);
 	}
-#endif
 
 	return r;
 }
@@ -238,12 +77,14 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta
 ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	/* get object space normal */
-	float3 N = ccl_fetch(sd, N);
+	float3 N = sd->N;
 
-	N = ccl_fetch(sd, N);
+	N = sd->N;
 	object_inverse_normal_transform(kg, sd, &N);
 
 	/* project from direction vector to barycentric coordinates in triangles */
+	float3 signed_N = N;
+
 	N.x = fabsf(N.x);
 	N.y = fabsf(N.y);
 	N.z = fabsf(N.z);
@@ -313,12 +154,19 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
 	float4 f = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 	uint use_alpha = stack_valid(alpha_offset);
 
-	if(weight.x > 0.0f)
-		f += weight.x*svm_image_texture(kg, id, co.y, co.z, srgb, use_alpha);
-	if(weight.y > 0.0f)
-		f += weight.y*svm_image_texture(kg, id, co.x, co.z, srgb, use_alpha);
-	if(weight.z > 0.0f)
-		f += weight.z*svm_image_texture(kg, id, co.y, co.x, srgb, use_alpha);
+	/* Map so that no textures are flipped, rotation is somewhat arbitrary. */
+	if(weight.x > 0.0f) {
+		float2 uv = make_float2((signed_N.x < 0.0f)? 1.0f - co.y: co.y, co.z);
+		f += weight.x*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha);
+	}
+	if(weight.y > 0.0f) {
+		float2 uv = make_float2((signed_N.y > 0.0f)? 1.0f - co.x: co.x, co.z);
+		f += weight.y*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha);
+	}
+	if(weight.z > 0.0f) {
+		float2 uv = make_float2((signed_N.z > 0.0f)? 1.0f - co.y: co.y, co.x);
+		f += weight.z*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha);
+	}
 
 	if(stack_valid(out_offset))
 		stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));
@@ -337,8 +185,8 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, floa
 	float3 co = stack_load_float3(stack, co_offset);
 	float2 uv;
 
-	co = normalize(co);
-	
+	co = safe_normalize(co);
+
 	if(projection == 0)
 		uv = direction_to_equirectangular(co);
 	else
diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h
index f35ea05048b..1492e358608 100644
--- a/intern/cycles/kernel/svm/svm_light_path.h
+++ b/intern/cycles/kernel/svm/svm_light_path.h
@@ -31,9 +31,11 @@ ccl_device void svm_node_light_path(ShaderData *sd, ccl_addr_space PathState *st
 		case NODE_LP_reflection: info = (path_flag & PATH_RAY_REFLECT)? 1.0f: 0.0f; break;
 		case NODE_LP_transmission: info = (path_flag & PATH_RAY_TRANSMIT)? 1.0f: 0.0f; break;
 		case NODE_LP_volume_scatter: info = (path_flag & PATH_RAY_VOLUME_SCATTER)? 1.0f: 0.0f; break;
-		case NODE_LP_backfacing: info = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f: 0.0f; break;
-		case NODE_LP_ray_length: info = ccl_fetch(sd, ray_length); break;
+		case NODE_LP_backfacing: info = (sd->flag & SD_BACKFACING)? 1.0f: 0.0f; break;
+		case NODE_LP_ray_length: info = sd->ray_length; break;
 		case NODE_LP_ray_depth: info = (float)state->bounce; break;
+		case NODE_LP_ray_diffuse: info = (float)state->diffuse_bounce; break;
+		case NODE_LP_ray_glossy: info = (float)state->glossy_bounce; break;
 		case NODE_LP_ray_transparent: info = (float)state->transparent_bounce; break;
 		case NODE_LP_ray_transmission: info = (float)state->transmission_bounce; break;
 	}
@@ -54,14 +56,14 @@ ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
 
 	switch(type) {
 		case NODE_LIGHT_FALLOFF_QUADRATIC: break;
-		case NODE_LIGHT_FALLOFF_LINEAR: strength *= ccl_fetch(sd, ray_length); break;
-		case NODE_LIGHT_FALLOFF_CONSTANT: strength *= ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); break;
+		case NODE_LIGHT_FALLOFF_LINEAR: strength *= sd->ray_length; break;
+		case NODE_LIGHT_FALLOFF_CONSTANT: strength *= sd->ray_length*sd->ray_length; break;
 	}
 
 	float smooth = stack_load_float(stack, smooth_offset);
 
 	if(smooth > 0.0f) {
-		float squared = ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length);
+		float squared = sd->ray_length*sd->ray_length;
 		/* Distant lamps set the ray length to FLT_MAX, which causes squared to overflow. */
 		if(isfinite(squared)) {
 			strength *= squared/(smooth + squared);
diff --git a/intern/cycles/kernel/svm/svm_math_util.h b/intern/cycles/kernel/svm/svm_math_util.h
index 01547b60014..1ce7777aac3 100644
--- a/intern/cycles/kernel/svm/svm_math_util.h
+++ b/intern/cycles/kernel/svm/svm_math_util.h
@@ -100,66 +100,64 @@ ccl_device float svm_math(NodeMath type, float Fac1, float Fac2)
 	return Fac;
 }
 
-ccl_device float3 svm_math_blackbody_color(float t) {
-	/* Calculate color in range 800..12000 using an approximation
-	 * a/x+bx+c for R and G and ((at + b)t + c)t + d) for B
-	 * Max absolute error for RGB is (0.00095, 0.00077, 0.00057),
-	 * which is enough to get the same 8 bit/channel color.
-	 */
-
-	const float rc[6][3] = {
-		{  2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f },
-		{  3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f },
-		{  4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f },
-		{  4.66849800e+03f,  2.85655028e-05f, 1.29075375e-01f },
-		{  4.60124770e+03f,  2.89727618e-05f, 1.48001316e-01f },
-		{  3.78765709e+03f,  9.36026367e-06f, 3.98995841e-01f },
-	};
-
-	const float gc[6][3] = {
-		{ -7.50343014e+02f,  3.15679613e-04f, 4.73464526e-01f },
-		{ -1.00402363e+03f,  1.29189794e-04f, 9.08181524e-01f },
-		{ -1.22075471e+03f,  2.56245413e-05f, 1.20753416e+00f },
-		{ -1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f },
-		{ -1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f },
-		{ -5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f },
-	};
-
-	const float bc[6][4] = {
-		{ 0.0f, 0.0f, 0.0f, 0.0f }, /* zeros should be optimized by compiler */
-		{ 0.0f, 0.0f, 0.0f, 0.0f },
-		{ 0.0f, 0.0f, 0.0f, 0.0f },
-		{ -2.02524603e-11f,  1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f },
-		{ -2.22463426e-13f, -1.55078698e-08f,  3.81675160e-04f, -7.30646033e-01f },
-		{  6.72595954e-13f, -2.73059993e-08f,  4.24068546e-04f, -7.52204323e-01f },
-	};
-
-	if(t >= 12000.0f)
+/* Calculate color in range 800..12000 using an approximation
+ * a/t + b*t + c for R and G, and ((a*t + b)*t + c)*t + d for B.
+ * Max absolute error for RGB is (0.00095, 0.00077, 0.00057),
+ * which is enough to get the same 8 bit/channel color.
+ */
+
+ccl_static_constant float blackbody_table_r[6][3] = {
+	{  2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f },
+	{  3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f },
+	{  4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f },
+	{  4.66849800e+03f,  2.85655028e-05f, 1.29075375e-01f },
+	{  4.60124770e+03f,  2.89727618e-05f, 1.48001316e-01f },
+	{  3.78765709e+03f,  9.36026367e-06f, 3.98995841e-01f },
+};
+
+ccl_static_constant float blackbody_table_g[6][3] = {
+	{ -7.50343014e+02f,  3.15679613e-04f, 4.73464526e-01f },
+	{ -1.00402363e+03f,  1.29189794e-04f, 9.08181524e-01f },
+	{ -1.22075471e+03f,  2.56245413e-05f, 1.20753416e+00f },
+	{ -1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f },
+	{ -1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f },
+	{ -5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f },
+};
+
+ccl_static_constant float blackbody_table_b[6][4] = {
+	{ 0.0f, 0.0f, 0.0f, 0.0f }, /* zeros should be optimized by compiler */
+	{ 0.0f, 0.0f, 0.0f, 0.0f },
+	{ 0.0f, 0.0f, 0.0f, 0.0f },
+	{ -2.02524603e-11f,  1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f },
+	{ -2.22463426e-13f, -1.55078698e-08f,  3.81675160e-04f, -7.30646033e-01f },
+	{  6.72595954e-13f, -2.73059993e-08f,  4.24068546e-04f, -7.52204323e-01f },
+};
+
+
+ccl_device float3 svm_math_blackbody_color(float t)
+{
+	if(t >= 12000.0f) {
 		return make_float3(0.826270103f, 0.994478524f, 1.56626022f);
+	}
+	else if(t < 965.0f) {
+		/* For 800 <= t < 965 color does not change in OSL implementation, so keep color the same */
+		return make_float3(4.70366907f, 0.0f, 0.0f);
+	}
+
+	int i = (t >= 6365.0f)? 5:
+		(t >= 3315.0f)? 4:
+		(t >= 1902.0f)? 3:
+		(t >= 1449.0f)? 2:
+		(t >= 1167.0f)? 1: 0;
+
+	ccl_constant float *r = blackbody_table_r[i];
+	ccl_constant float *g = blackbody_table_g[i];
+	ccl_constant float *b = blackbody_table_b[i];
 
-	/* Define a macro to reduce stack usage for nvcc */
-#define MAKE_BB_RGB(i) make_float3(\
-		rc[i][0] / t + rc[i][1] * t + rc[i][2],\
-		gc[i][0] / t + gc[i][1] * t + gc[i][2],\
-		((bc[i][0] * t + bc[i][1]) * t + bc[i][2]) * t + bc[i][3])
-
-	if(t >= 6365.0f)
-		return MAKE_BB_RGB(5);
-	if(t >= 3315.0f)
-		return MAKE_BB_RGB(4);
-	if(t >= 1902.0f)
-		return MAKE_BB_RGB(3);
-	if(t >= 1449.0f)
-		return MAKE_BB_RGB(2);
-	if(t >= 1167.0f)
-		return MAKE_BB_RGB(1);
-	if(t >= 965.0f)
-		return MAKE_BB_RGB(0);
-
-#undef MAKE_BB_RGB
-
-	/* For 800 <= t < 965 color does not change in OSL implementation, so keep color the same */
-	return make_float3(4.70366907f, 0.0f, 0.0f);
+	const float t_inv = 1.0f / t;
+	return make_float3(r[0] * t_inv + r[1] * t + r[2],
+	                   g[0] * t_inv + g[1] * t + g[2],
+	                   ((b[0] * t + b[1]) * t + b[2]) * t + b[3]);
 }
 
 ccl_device_inline float3 svm_math_gamma_color(float3 color, float gamma)
diff --git a/intern/cycles/kernel/svm/svm_noisetex.h b/intern/cycles/kernel/svm/svm_noisetex.h
index 62ff38cf1c5..0347ab7b193 100644
--- a/intern/cycles/kernel/svm/svm_noisetex.h
+++ b/intern/cycles/kernel/svm/svm_noisetex.h
@@ -18,50 +18,42 @@ CCL_NAMESPACE_BEGIN
 
 /* Noise */
 
-ccl_device_inline void svm_noise(float3 p, float detail, float distortion, float *fac, float3 *color)
-{
-	int hard = 0;
-
-	if(distortion != 0.0f) {
-		float3 r, offset = make_float3(13.5f, 13.5f, 13.5f);
-
-		r.x = noise(p + offset) * distortion;
-		r.y = noise(p) * distortion;
-		r.z = noise(p - offset) * distortion;
-
-		p += r;
-	}
-
-	*fac = noise_turbulence(p, detail, hard);
-	*color = make_float3(*fac,
-		noise_turbulence(make_float3(p.y, p.x, p.z), detail, hard),
-		noise_turbulence(make_float3(p.y, p.z, p.x), detail, hard));
-}
-
 ccl_device void svm_node_tex_noise(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
 {
 	uint co_offset, scale_offset, detail_offset, distortion_offset, fac_offset, color_offset;
 
 	decode_node_uchar4(node.y, &co_offset, &scale_offset, &detail_offset, &distortion_offset);
+	decode_node_uchar4(node.z, &color_offset, &fac_offset, NULL, NULL);
 
 	uint4 node2 = read_node(kg, offset);
 
 	float scale = stack_load_float_default(stack, scale_offset, node2.x);
 	float detail = stack_load_float_default(stack, detail_offset, node2.y);
 	float distortion = stack_load_float_default(stack, distortion_offset, node2.z);
-	float3 co = stack_load_float3(stack, co_offset);
+	float3 p = stack_load_float3(stack, co_offset) * scale;
+	int hard = 0;
 
-	float3 color;
-	float f;
+	if(distortion != 0.0f) {
+		float3 r, offset = make_float3(13.5f, 13.5f, 13.5f);
+
+		r.x = noise(p + offset) * distortion;
+		r.y = noise(p) * distortion;
+		r.z = noise(p - offset) * distortion;
 
-	svm_noise(co*scale, detail, distortion, &f, &color);
+		p += r;
+	}
 
-	decode_node_uchar4(node.z, &color_offset, &fac_offset, NULL, NULL);
+	float f = noise_turbulence(p, detail, hard);
 
-	if(stack_valid(fac_offset))
+	if(stack_valid(fac_offset)) {
 		stack_store_float(stack, fac_offset, f);
-	if(stack_valid(color_offset))
+	}
+	if(stack_valid(color_offset)) {
+		float3 color = make_float3(f,
+			noise_turbulence(make_float3(p.y, p.x, p.z), detail, hard),
+			noise_turbulence(make_float3(p.y, p.z, p.x), detail, hard));
 		stack_store_float3(stack, color_offset, color);
+	}
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h
index c0b01262212..c94327401f5 100644
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -31,9 +31,9 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = ccl_fetch(sd, P);
+			data = sd->P;
 			if(node.w == 0) {
-				if(ccl_fetch(sd, object) != OBJECT_NONE) {
+				if(sd->object != OBJECT_NONE) {
 					object_inverse_position_transform(kg, sd, &data);
 				}
 			}
@@ -48,47 +48,47 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = ccl_fetch(sd, N);
+			data = sd->N;
 			object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = transform_point(&tfm, ccl_fetch(sd, P));
+			if(sd->object != OBJECT_NONE)
+				data = transform_point(&tfm, sd->P);
 			else
-				data = transform_point(&tfm, ccl_fetch(sd, P) + camera_position(kg));
+				data = transform_point(&tfm, sd->P + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P));
+			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, sd->ray_P);
 			else
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P));
+				data = camera_world_to_ndc(kg, sd, sd->P);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
+			if(sd->object != OBJECT_NONE)
+				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
 			else
-				data = ccl_fetch(sd, I);
+				data = sd->I;
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, ccl_fetch(sd, object));
+			data = object_dupli_generated(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, ccl_fetch(sd, object));
+			data = object_dupli_uv(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = ccl_fetch(sd, P);
+			data = sd->P;
 
 #ifdef __VOLUME__
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
+			if(sd->object != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -112,9 +112,9 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;
+			data = sd->P + sd->dP.dx;
 			if(node.w == 0) {
-				if(ccl_fetch(sd, object) != OBJECT_NONE) {
+				if(sd->object != OBJECT_NONE) {
 					object_inverse_position_transform(kg, sd, &data);
 				}
 			}
@@ -129,47 +129,47 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = ccl_fetch(sd, N);
+			data = sd->N;
 			object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
+			if(sd->object != OBJECT_NONE)
+				data = transform_point(&tfm, sd->P + sd->dP.dx);
 			else
-				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx + camera_position(kg));
+				data = transform_point(&tfm, sd->P + sd->dP.dx + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dx);
+			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx);
 			else
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
+				data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
+			if(sd->object != OBJECT_NONE)
+				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
 			else
-				data = ccl_fetch(sd, I);
+				data = sd->I;
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, ccl_fetch(sd, object));
+			data = object_dupli_generated(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, ccl_fetch(sd, object));
+			data = object_dupli_uv(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;
+			data = sd->P + sd->dP.dx;
 
 #ifdef __VOLUME__
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
+			if(sd->object != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -196,9 +196,9 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;
+			data = sd->P + sd->dP.dy;
 			if(node.w == 0) {
-				if(ccl_fetch(sd, object) != OBJECT_NONE) {
+				if(sd->object != OBJECT_NONE) {
 					object_inverse_position_transform(kg, sd, &data);
 				}
 			}
@@ -213,47 +213,47 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = ccl_fetch(sd, N);
+			data = sd->N;
 			object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
+			if(sd->object != OBJECT_NONE)
+				data = transform_point(&tfm, sd->P + sd->dP.dy);
 			else
-				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy + camera_position(kg));
+				data = transform_point(&tfm, sd->P + sd->dP.dy + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dy);
+			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy);
 			else
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
+				data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
+			if(sd->object != OBJECT_NONE)
+				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
 			else
-				data = ccl_fetch(sd, I);
+				data = sd->I;
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, ccl_fetch(sd, object));
+			data = object_dupli_generated(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, ccl_fetch(sd, object));
+			data = object_dupli_uv(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;
+			data = sd->P + sd->dP.dy;
 
 #ifdef __VOLUME__
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
+			if(sd->object != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -274,12 +274,12 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 	float3 color = stack_load_float3(stack, color_offset);
 	color = 2.0f*make_float3(color.x - 0.5f, color.y - 0.5f, color.z - 0.5f);
 
-	bool is_backfacing = (ccl_fetch(sd, flag) & SD_BACKFACING) != 0;
+	bool is_backfacing = (sd->flag & SD_BACKFACING) != 0;
 	float3 N;
 
 	if(space == NODE_NORMAL_MAP_TANGENT) {
 		/* tangent space */
-		if(ccl_fetch(sd, object) == OBJECT_NONE) {
+		if(sd->object == OBJECT_NONE) {
 			stack_store_float3(stack, normal_offset, make_float3(0.0f, 0.0f, 0.0f));
 			return;
 		}
@@ -299,11 +299,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 		float sign = primitive_attribute_float(kg, sd, attr_sign, NULL, NULL);
 		float3 normal;
 
-		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
+		if(sd->shader & SHADER_SMOOTH_NORMAL) {
 			normal = primitive_attribute_float3(kg, sd, attr_normal, NULL, NULL);
 		}
 		else {
-			normal = ccl_fetch(sd, Ng);
+			normal = sd->Ng;
 
 			/* the normal is already inverted, which is too soon for the math here */
 			if(is_backfacing) {
@@ -345,11 +345,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 
 	if(strength != 1.0f) {
 		strength = max(strength, 0.0f);
-		N = safe_normalize(ccl_fetch(sd, N) + (N - ccl_fetch(sd, N))*strength);
+		N = safe_normalize(sd->N + (N - sd->N)*strength);
 	}
 
 	if(is_zero(N)) {
-		N = ccl_fetch(sd, N);
+		N = sd->N;
 	}
 
 	stack_store_float3(stack, normal_offset, N);
@@ -377,7 +377,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
 		float3 generated;
 
 		if(desc.offset == ATTR_STD_NOT_FOUND)
-			generated = ccl_fetch(sd, P);
+			generated = sd->P;
 		else
 			generated = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 
@@ -390,7 +390,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
 	}
 
 	object_normal_transform(kg, sd, &tangent);
-	tangent = cross(ccl_fetch(sd, N), normalize(cross(tangent, ccl_fetch(sd, N))));
+	tangent = cross(sd->N, normalize(cross(tangent, sd->N)));
 	stack_store_float3(stack, tangent_offset, tangent);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index 5adf7d34f7f..d859cae1708 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -188,6 +188,8 @@ typedef enum NodeLightPath {
 	NODE_LP_backfacing,
 	NODE_LP_ray_length,
 	NODE_LP_ray_depth,
+	NODE_LP_ray_diffuse,
+	NODE_LP_ray_glossy,
 	NODE_LP_ray_transparent,
 	NODE_LP_ray_transmission,
 } NodeLightPath;
@@ -395,17 +397,23 @@ typedef enum ClosureType {
 	CLOSURE_BSDF_DIFFUSE_ID,
 	CLOSURE_BSDF_OREN_NAYAR_ID,
 	CLOSURE_BSDF_DIFFUSE_RAMP_ID,
+	CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID,
+	CLOSURE_BSDF_PRINCIPLED_SHEEN_ID,
 	CLOSURE_BSDF_DIFFUSE_TOON_ID,
 
 	/* Glossy */
-	CLOSURE_BSDF_GLOSSY_ID,
 	CLOSURE_BSDF_REFLECTION_ID,
 	CLOSURE_BSDF_MICROFACET_GGX_ID,
+	CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID,
+	CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID,
 	CLOSURE_BSDF_MICROFACET_BECKMANN_ID,
 	CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID,
+	CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID,
 	CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID,
 	CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID,
+	CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID,
 	CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID,
+	CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_FRESNEL_ID,
 	CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID,
 	CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID,
 	CLOSURE_BSDF_ASHIKHMIN_VELVET_ID,
@@ -414,24 +422,26 @@ typedef enum ClosureType {
 	CLOSURE_BSDF_HAIR_REFLECTION_ID,
 
 	/* Transmission */
-	CLOSURE_BSDF_TRANSMISSION_ID,
 	CLOSURE_BSDF_TRANSLUCENT_ID,
 	CLOSURE_BSDF_REFRACTION_ID,
 	CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID,
 	CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID,
+	CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
 	CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID,
 	CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID,
-	CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
+	CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID,
 	CLOSURE_BSDF_SHARP_GLASS_ID,
 	CLOSURE_BSDF_HAIR_TRANSMISSION_ID,
 
 	/* Special cases */
 	CLOSURE_BSDF_BSSRDF_ID,
+	CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID,
 	CLOSURE_BSDF_TRANSPARENT_ID,
 
 	/* BSSRDF */
 	CLOSURE_BSSRDF_CUBIC_ID,
 	CLOSURE_BSSRDF_GAUSSIAN_ID,
+	CLOSURE_BSSRDF_PRINCIPLED_ID,
 	CLOSURE_BSSRDF_BURLEY_ID,
 
 	/* Other */
@@ -445,19 +455,24 @@ typedef enum ClosureType {
 	CLOSURE_VOLUME_ABSORPTION_ID,
 	CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID,
 
+	CLOSURE_BSDF_PRINCIPLED_ID,
+
 	NBUILTIN_CLOSURES
 } ClosureType;
 
 /* watch this, being lazy with memory usage */
 #define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID)
 #define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_DIFFUSE_TOON_ID)
-#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
-#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
-#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID)
+#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
+#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
+#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID)
+#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID)
 #define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID)
 #define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\
                                             type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \
-											type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)
+                                            type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)
+#define CLOSURE_IS_BSDF_MICROFACET(type) ((type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) ||\
+                                          (type >= CLOSURE_BSDF_REFRACTION_ID && type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID))
 #define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID)
 #define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID)
 #define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
@@ -466,7 +481,8 @@ typedef enum ClosureType {
 #define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID)
 #define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID)
 #define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
-#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
+#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
+#define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID)
 
 #define CLOSURE_WEIGHT_CUTOFF 1e-5f
 
diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h
index 4c32130d06d..4e92f27acdb 100644
--- a/intern/cycles/kernel/svm/svm_vector_transform.h
+++ b/intern/cycles/kernel/svm/svm_vector_transform.h
@@ -33,7 +33,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo
 	NodeVectorTransformConvertSpace to = (NodeVectorTransformConvertSpace)ito;
 	
 	Transform tfm;
-	bool is_object = (ccl_fetch(sd, object) != OBJECT_NONE);
+	bool is_object = (sd->object != OBJECT_NONE);
 	bool is_direction = (type == NODE_VECTOR_TRANSFORM_TYPE_VECTOR || type == NODE_VECTOR_TRANSFORM_TYPE_NORMAL);
 	
 	/* From world */
diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h
index a8b3604a8a7..d967516a5c9 100644
--- a/intern/cycles/kernel/svm/svm_voxel.h
+++ b/intern/cycles/kernel/svm/svm_voxel.h
@@ -42,24 +42,8 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg,
 		tfm.w = read_node_float(kg, offset);
 		co = transform_point(&tfm, co);
 	}
-	float4 r;
-#  if defined(__KERNEL_CUDA__)
-#    if __CUDA_ARCH__ >= 300
-	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
-	if(id < 2048) /* TODO(dingto): Make this a variable */
-		r = kernel_tex_image_interp_3d_float4(tex, co.x, co.y, co.z);
-	else {
-		float f = kernel_tex_image_interp_3d_float(tex, co.x, co.y, co.z);
-		r = make_float4(f, f, f, 1.0f);
-	}
-#    else /* __CUDA_ARCH__ >= 300 */
-	r = volume_image_texture_3d(id, co.x, co.y, co.z);
-#    endif
-#  elif defined(__KERNEL_OPENCL__)
-	r = kernel_tex_image_interp_3d(kg, id, co.x, co.y, co.z);
-#  else
-	r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z);
-#  endif /* __KERNEL_CUDA__ */
+
+	float4 r = kernel_tex_image_interp_3d(kg, id, co.x, co.y, co.z, INTERPOLATION_NONE);
 #else
 	float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 #endif
diff --git a/intern/cycles/kernel/svm/svm_wavelength.h b/intern/cycles/kernel/svm/svm_wavelength.h
index 57030f3979d..855b356b397 100644
--- a/intern/cycles/kernel/svm/svm_wavelength.h
+++ b/intern/cycles/kernel/svm/svm_wavelength.h
@@ -34,44 +34,44 @@ CCL_NAMESPACE_BEGIN
 
 /* Wavelength to RGB */
 
+// CIE colour matching functions xBar, yBar and zBar, tabulated every
+// 5 nanometers for wavelengths from 380 through 780 nanometers, which
+// gives 81 rows.  For a tabulated wavelength lambda (in nanometers):
+//   cie_colour_match[(lambda - 380) / 5][0] = xBar
+//   cie_colour_match[(lambda - 380) / 5][1] = yBar
+//   cie_colour_match[(lambda - 380) / 5][2] = zBar
+ccl_static_constant float cie_colour_match[81][3] = {
+	{0.0014f,0.0000f,0.0065f}, {0.0022f,0.0001f,0.0105f}, {0.0042f,0.0001f,0.0201f},
+	{0.0076f,0.0002f,0.0362f}, {0.0143f,0.0004f,0.0679f}, {0.0232f,0.0006f,0.1102f},
+	{0.0435f,0.0012f,0.2074f}, {0.0776f,0.0022f,0.3713f}, {0.1344f,0.0040f,0.6456f},
+	{0.2148f,0.0073f,1.0391f}, {0.2839f,0.0116f,1.3856f}, {0.3285f,0.0168f,1.6230f},
+	{0.3483f,0.0230f,1.7471f}, {0.3481f,0.0298f,1.7826f}, {0.3362f,0.0380f,1.7721f},
+	{0.3187f,0.0480f,1.7441f}, {0.2908f,0.0600f,1.6692f}, {0.2511f,0.0739f,1.5281f},
+	{0.1954f,0.0910f,1.2876f}, {0.1421f,0.1126f,1.0419f}, {0.0956f,0.1390f,0.8130f},
+	{0.0580f,0.1693f,0.6162f}, {0.0320f,0.2080f,0.4652f}, {0.0147f,0.2586f,0.3533f},
+	{0.0049f,0.3230f,0.2720f}, {0.0024f,0.4073f,0.2123f}, {0.0093f,0.5030f,0.1582f},
+	{0.0291f,0.6082f,0.1117f}, {0.0633f,0.7100f,0.0782f}, {0.1096f,0.7932f,0.0573f},
+	{0.1655f,0.8620f,0.0422f}, {0.2257f,0.9149f,0.0298f}, {0.2904f,0.9540f,0.0203f},
+	{0.3597f,0.9803f,0.0134f}, {0.4334f,0.9950f,0.0087f}, {0.5121f,1.0000f,0.0057f},
+	{0.5945f,0.9950f,0.0039f}, {0.6784f,0.9786f,0.0027f}, {0.7621f,0.9520f,0.0021f},
+	{0.8425f,0.9154f,0.0018f}, {0.9163f,0.8700f,0.0017f}, {0.9786f,0.8163f,0.0014f},
+	{1.0263f,0.7570f,0.0011f}, {1.0567f,0.6949f,0.0010f}, {1.0622f,0.6310f,0.0008f},
+	{1.0456f,0.5668f,0.0006f}, {1.0026f,0.5030f,0.0003f}, {0.9384f,0.4412f,0.0002f},
+	{0.8544f,0.3810f,0.0002f}, {0.7514f,0.3210f,0.0001f}, {0.6424f,0.2650f,0.0000f},
+	{0.5419f,0.2170f,0.0000f}, {0.4479f,0.1750f,0.0000f}, {0.3608f,0.1382f,0.0000f},
+	{0.2835f,0.1070f,0.0000f}, {0.2187f,0.0816f,0.0000f}, {0.1649f,0.0610f,0.0000f},
+	{0.1212f,0.0446f,0.0000f}, {0.0874f,0.0320f,0.0000f}, {0.0636f,0.0232f,0.0000f},
+	{0.0468f,0.0170f,0.0000f}, {0.0329f,0.0119f,0.0000f}, {0.0227f,0.0082f,0.0000f},
+	{0.0158f,0.0057f,0.0000f}, {0.0114f,0.0041f,0.0000f}, {0.0081f,0.0029f,0.0000f},
+	{0.0058f,0.0021f,0.0000f}, {0.0041f,0.0015f,0.0000f}, {0.0029f,0.0010f,0.0000f},
+	{0.0020f,0.0007f,0.0000f}, {0.0014f,0.0005f,0.0000f}, {0.0010f,0.0004f,0.0000f},
+	{0.0007f,0.0002f,0.0000f}, {0.0005f,0.0002f,0.0000f}, {0.0003f,0.0001f,0.0000f},
+	{0.0002f,0.0001f,0.0000f}, {0.0002f,0.0001f,0.0000f}, {0.0001f,0.0000f,0.0000f},
+	{0.0001f,0.0000f,0.0000f}, {0.0001f,0.0000f,0.0000f}, {0.0000f,0.0000f,0.0000f}
+};
+
 ccl_device void svm_node_wavelength(ShaderData *sd, float *stack, uint wavelength, uint color_out)
 {	
-	// CIE colour matching functions xBar, yBar, and zBar for
-	//	 wavelengths from 380 through 780 nanometers, every 5
-	//	 nanometers.  For a wavelength lambda in this range:
-	//		  cie_colour_match[(lambda - 380) / 5][0] = xBar
-	//		  cie_colour_match[(lambda - 380) / 5][1] = yBar
-	//		  cie_colour_match[(lambda - 380) / 5][2] = zBar
-	const float cie_colour_match[81][3] = {
-		{0.0014f,0.0000f,0.0065f}, {0.0022f,0.0001f,0.0105f}, {0.0042f,0.0001f,0.0201f},
-		{0.0076f,0.0002f,0.0362f}, {0.0143f,0.0004f,0.0679f}, {0.0232f,0.0006f,0.1102f},
-		{0.0435f,0.0012f,0.2074f}, {0.0776f,0.0022f,0.3713f}, {0.1344f,0.0040f,0.6456f},
-		{0.2148f,0.0073f,1.0391f}, {0.2839f,0.0116f,1.3856f}, {0.3285f,0.0168f,1.6230f},
-		{0.3483f,0.0230f,1.7471f}, {0.3481f,0.0298f,1.7826f}, {0.3362f,0.0380f,1.7721f},
-		{0.3187f,0.0480f,1.7441f}, {0.2908f,0.0600f,1.6692f}, {0.2511f,0.0739f,1.5281f},
-		{0.1954f,0.0910f,1.2876f}, {0.1421f,0.1126f,1.0419f}, {0.0956f,0.1390f,0.8130f},
-		{0.0580f,0.1693f,0.6162f}, {0.0320f,0.2080f,0.4652f}, {0.0147f,0.2586f,0.3533f},
-		{0.0049f,0.3230f,0.2720f}, {0.0024f,0.4073f,0.2123f}, {0.0093f,0.5030f,0.1582f},
-		{0.0291f,0.6082f,0.1117f}, {0.0633f,0.7100f,0.0782f}, {0.1096f,0.7932f,0.0573f},
-		{0.1655f,0.8620f,0.0422f}, {0.2257f,0.9149f,0.0298f}, {0.2904f,0.9540f,0.0203f},
-		{0.3597f,0.9803f,0.0134f}, {0.4334f,0.9950f,0.0087f}, {0.5121f,1.0000f,0.0057f},
-		{0.5945f,0.9950f,0.0039f}, {0.6784f,0.9786f,0.0027f}, {0.7621f,0.9520f,0.0021f},
-		{0.8425f,0.9154f,0.0018f}, {0.9163f,0.8700f,0.0017f}, {0.9786f,0.8163f,0.0014f},
-		{1.0263f,0.7570f,0.0011f}, {1.0567f,0.6949f,0.0010f}, {1.0622f,0.6310f,0.0008f},
-		{1.0456f,0.5668f,0.0006f}, {1.0026f,0.5030f,0.0003f}, {0.9384f,0.4412f,0.0002f},
-		{0.8544f,0.3810f,0.0002f}, {0.7514f,0.3210f,0.0001f}, {0.6424f,0.2650f,0.0000f},
-		{0.5419f,0.2170f,0.0000f}, {0.4479f,0.1750f,0.0000f}, {0.3608f,0.1382f,0.0000f},
-		{0.2835f,0.1070f,0.0000f}, {0.2187f,0.0816f,0.0000f}, {0.1649f,0.0610f,0.0000f},
-		{0.1212f,0.0446f,0.0000f}, {0.0874f,0.0320f,0.0000f}, {0.0636f,0.0232f,0.0000f},
-		{0.0468f,0.0170f,0.0000f}, {0.0329f,0.0119f,0.0000f}, {0.0227f,0.0082f,0.0000f},
-		{0.0158f,0.0057f,0.0000f}, {0.0114f,0.0041f,0.0000f}, {0.0081f,0.0029f,0.0000f},
-		{0.0058f,0.0021f,0.0000f}, {0.0041f,0.0015f,0.0000f}, {0.0029f,0.0010f,0.0000f},
-		{0.0020f,0.0007f,0.0000f}, {0.0014f,0.0005f,0.0000f}, {0.0010f,0.0004f,0.0000f},
-		{0.0007f,0.0002f,0.0000f}, {0.0005f,0.0002f,0.0000f}, {0.0003f,0.0001f,0.0000f},
-		{0.0002f,0.0001f,0.0000f}, {0.0002f,0.0001f,0.0000f}, {0.0001f,0.0000f,0.0000f},
-		{0.0001f,0.0000f,0.0000f}, {0.0001f,0.0000f,0.0000f}, {0.0000f,0.0000f,0.0000f}
-	};
-
 	float lambda_nm = stack_load_float(stack, wavelength);
 	float ii = (lambda_nm-380.0f) * (1.0f/5.0f);  // scaled 0..80
 	int i = float_to_int(ii);
@@ -82,7 +82,7 @@ ccl_device void svm_node_wavelength(ShaderData *sd, float *stack, uint wavelengt
 	}
 	else {
 		ii -= i;
-		const float *c = cie_colour_match[i];
+		ccl_constant float *c = cie_colour_match[i];
 		color = interp(make_float3(c[0], c[1], c[2]), make_float3(c[3], c[4], c[5]), ii);
 	}
 	
@@ -92,8 +92,7 @@ ccl_device void svm_node_wavelength(ShaderData *sd, float *stack, uint wavelengt
 	/* Clamp to zero if values are smaller */
 	color = max(color, make_float3(0.0f, 0.0f, 0.0f));
 
-	if(stack_valid(color_out))
-		stack_store_float3(stack, color_out, color);
+	stack_store_float3(stack, color_out, color);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h
index 6eed9bc1a99..3c6353c8001 100644
--- a/intern/cycles/kernel/svm/svm_wireframe.h
+++ b/intern/cycles/kernel/svm/svm_wireframe.h
@@ -41,9 +41,9 @@ ccl_device_inline float wireframe(KernelGlobals *kg,
                                   float3 *P)
 {
 #ifdef __HAIR__
-	if(ccl_fetch(sd, prim) != PRIM_NONE && ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)
+	if(sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE)
 #else
-	if(ccl_fetch(sd, prim) != PRIM_NONE)
+	if(sd->prim != PRIM_NONE)
 #endif
 	{
 		float3 Co[3];
@@ -52,12 +52,12 @@ ccl_device_inline float wireframe(KernelGlobals *kg,
 		/* Triangles */
 		int np = 3;
 
-		if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE)
-			triangle_vertices(kg, ccl_fetch(sd, prim), Co);
+		if(sd->type & PRIMITIVE_TRIANGLE)
+			triangle_vertices(kg, sd->prim, Co);
 		else
-			motion_triangle_vertices(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), Co);
+			motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co);
 
-		if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED)) {
+		if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 			object_position_transform(kg, sd, &Co[0]);
 			object_position_transform(kg, sd, &Co[1]);
 			object_position_transform(kg, sd, &Co[2]);
@@ -66,8 +66,8 @@ ccl_device_inline float wireframe(KernelGlobals *kg,
 		if(pixel_size) {
 			// Project the derivatives of P to the viewing plane defined
 			// by I so we have a measure of how big is a pixel at this point
-			float pixelwidth_x = len(ccl_fetch(sd, dP).dx - dot(ccl_fetch(sd, dP).dx, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
-			float pixelwidth_y = len(ccl_fetch(sd, dP).dy - dot(ccl_fetch(sd, dP).dy, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
+			float pixelwidth_x = len(sd->dP.dx - dot(sd->dP.dx, sd->I) * sd->I);
+			float pixelwidth_y = len(sd->dP.dy - dot(sd->dP.dy, sd->I) * sd->I);
 			// Take the average of both axis' length
 			pixelwidth = (pixelwidth_x + pixelwidth_y) * 0.5f;
 		}
@@ -113,20 +113,20 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg,
 	 * With OpenCL 2.0 it's possible to avoid this change, but for until
 	 * then we'll be living with such an exception.
 	 */
-	float3 P = ccl_fetch(sd, P);
+	float3 P = sd->P;
 	float f = wireframe(kg, sd, size, pixel_size, &P);
 #else
-	float f = wireframe(kg, sd, size, pixel_size, &ccl_fetch(sd, P));
+	float f = wireframe(kg, sd, size, pixel_size, &sd->P);
 #endif
 
 	/* TODO(sergey): Think of faster way to calculate derivatives. */
 	if(bump_offset == NODE_BUMP_OFFSET_DX) {
-		float3 Px = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dx;
-		f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(ccl_fetch(sd, dP).dx);
+		float3 Px = sd->P - sd->dP.dx;
+		f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(sd->dP.dx);
 	}
 	else if(bump_offset == NODE_BUMP_OFFSET_DY) {
-		float3 Py = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dy;
-		f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(ccl_fetch(sd, dP).dy);
+		float3 Py = sd->P - sd->dP.dy;
+		f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(sd->dP.dy);
 	}
 
 	if(stack_valid(out_fac))